summary refs log tree commit diff stats
path: root/src/crypto/isa-l/isa-l_crypto/aes
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
commit 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/crypto/isa-l/isa-l_crypto/aes
parent Initial commit. (diff)
download ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
Adding upstream version 14.2.21. (upstream/14.2.21, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/aes')
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/Makefile.am103
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm1777
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm1747
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm1746
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm1778
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm1530
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm1505
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm1504
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm1529
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm1961
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm1895
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm1897
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm1962
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm1707
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm1652
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm1651
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm1707
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm427
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm161
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm161
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm163
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm157
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm160
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm157
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm136
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm150
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm148
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm146
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm140
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm147
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm83
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c315
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c56
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h466
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c443
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c183
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm1996
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm1990
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm2033
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm2036
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm2030
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm2074
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm163
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm172
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c272
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c71
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c1937
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c322
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h476
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm320
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm268
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm280
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm74
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h300
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c141
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c102
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c142
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c100
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c116
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c207
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c106
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h1691
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c143
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c103
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c143
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c101
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c113
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c209
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c105
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h1035
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm78
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm78
74 files changed, 53473 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am
new file mode 100644
index 00000000..9ae1f0cb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am
@@ -0,0 +1,103 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+extern_hdrs += include/aes_gcm.h include/aes_cbc.h include/aes_xts.h include/aes_keyexp.h
+# lsrc: GCM, key-expansion, CBC and XTS sources appended to the list owned by the including makefile (not shown here)
+lsrc += aes/gcm_multibinary.asm aes/gcm_pre.c
+lsrc += aes/gcm128_avx_gen2.asm aes/gcm128_avx_gen4.asm aes/gcm128_sse.asm
+lsrc += aes/gcm256_avx_gen2.asm aes/gcm256_avx_gen4.asm aes/gcm256_sse.asm
+lsrc += aes/keyexp_multibinary.asm
+lsrc += aes/keyexp_128.asm aes/keyexp_192.asm aes/keyexp_256.asm
+lsrc += aes/cbc_multibinary.asm
+lsrc += aes/cbc_dec_128_x4_sse.asm aes/cbc_dec_128_x8_avx.asm
+lsrc += aes/cbc_dec_192_x4_sse.asm aes/cbc_dec_192_x8_avx.asm
+lsrc += aes/cbc_dec_256_x4_sse.asm aes/cbc_dec_256_x8_avx.asm
+lsrc += aes/cbc_enc_128_x4_sb.asm aes/cbc_enc_128_x8_sb.asm
+lsrc += aes/cbc_enc_192_x4_sb.asm aes/cbc_enc_192_x8_sb.asm
+lsrc += aes/cbc_enc_256_x4_sb.asm aes/cbc_enc_256_x8_sb.asm
+lsrc += aes/cbc_pre.c
+lsrc += aes/xts_aes_128_multibinary.asm
+lsrc += aes/XTS_AES_128_dec_sse.asm aes/XTS_AES_128_dec_expanded_key_sse.asm
+lsrc += aes/XTS_AES_128_enc_sse.asm aes/XTS_AES_128_enc_expanded_key_sse.asm
+lsrc += aes/XTS_AES_128_dec_avx.asm aes/XTS_AES_128_dec_expanded_key_avx.asm
+lsrc += aes/XTS_AES_128_enc_avx.asm aes/XTS_AES_128_enc_expanded_key_avx.asm
+lsrc += aes/xts_aes_256_multibinary.asm
+lsrc += aes/XTS_AES_256_dec_avx.asm aes/XTS_AES_256_dec_expanded_key_avx.asm
+lsrc += aes/XTS_AES_256_enc_avx.asm aes/XTS_AES_256_enc_expanded_key_avx.asm
+lsrc += aes/XTS_AES_256_dec_sse.asm aes/XTS_AES_256_dec_expanded_key_sse.asm
+lsrc += aes/XTS_AES_256_enc_sse.asm aes/XTS_AES_256_enc_expanded_key_sse.asm
+# other_src: additional .asm include files and .h headers that are not listed in lsrc
+other_src += include/multibinary.asm
+other_src += include/test.h include/types.h include/reg_sizes.asm
+other_src += aes/gcm_defines.asm
+other_src += aes/cbc_common.asm aes/cbc_std_vectors.h
+other_src += aes/gcm_vectors.h aes/ossl_helper.h
+other_src += aes/xts_128_vect.h
+other_src += aes/xts_256_vect.h
+# check_tests: standard-vector and expanded-key test programs (none of them is given -lcrypto below)
+check_tests += aes/cbc_std_vectors_test
+check_tests += aes/gcm_std_vectors_test
+check_tests += aes/xts_128_test
+check_tests += aes/xts_256_test
+check_tests += aes/xts_128_expanded_key_test
+check_tests += aes/xts_256_expanded_key_test
+# unit_tests: randomized test programs; the *_random_test and *_ossl_test ones are given -lcrypto below
+unit_tests += aes/cbc_std_vectors_random_test
+unit_tests += aes/gcm_std_vectors_random_test
+unit_tests += aes/xts_128_rand aes/xts_128_rand_ossl_test
+unit_tests += aes/xts_256_rand aes/xts_256_rand_ossl_test
+# perf_tests: performance test programs; the *_ossl_perf ones are given -lcrypto below
+perf_tests += aes/cbc_ossl_perf
+perf_tests += aes/gcm_ossl_perf
+perf_tests += aes/xts_128_enc_ossl_perf
+perf_tests += aes/xts_256_enc_ossl_perf
+perf_tests += aes/xts_128_enc_perf aes/xts_128_dec_perf aes/xts_128_dec_ossl_perf
+perf_tests += aes/xts_256_enc_perf aes/xts_256_dec_perf aes/xts_256_dec_ossl_perf
+# Programs that need -lcrypto. Each one gets the same setting in two spellings:
+# a make target-specific LDLIBS (bare program name) and an automake per-program aes_<name>_LDFLAGS.
+cbc_ossl_perf: LDLIBS += -lcrypto
+aes_cbc_ossl_perf_LDFLAGS = -lcrypto
+cbc_std_vectors_random_test: LDLIBS += -lcrypto
+aes_cbc_std_vectors_random_test_LDFLAGS = -lcrypto
+gcm_ossl_perf: LDLIBS += -lcrypto
+aes_gcm_ossl_perf_LDFLAGS = -lcrypto
+gcm_std_vectors_random_test: LDLIBS += -lcrypto
+aes_gcm_std_vectors_random_test_LDFLAGS = -lcrypto
+xts_128_enc_ossl_perf: LDLIBS += -lcrypto
+aes_xts_128_enc_ossl_perf_LDFLAGS = -lcrypto
+xts_128_dec_ossl_perf: LDLIBS += -lcrypto
+aes_xts_128_dec_ossl_perf_LDFLAGS = -lcrypto
+xts_128_rand_ossl_test: LDLIBS += -lcrypto
+aes_xts_128_rand_ossl_test_LDFLAGS = -lcrypto
+xts_256_enc_ossl_perf : LDLIBS += -lcrypto
+aes_xts_256_enc_ossl_perf_LDFLAGS = -lcrypto
+xts_256_dec_ossl_perf : LDLIBS += -lcrypto
+aes_xts_256_dec_ossl_perf_LDFLAGS = -lcrypto
+xts_256_rand_ossl_test: LDLIBS += -lcrypto
+aes_xts_256_rand_ossl_test_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm
new file mode 100644
index 00000000..ebd1646a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm
@@ -0,0 +1,1777 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; 8 tweak values of 16 bytes each, byte offsets 0 .. 16*8-1 from rsp
+%define keys rsp + 16*8 ; 11 expanded round keys of 16 bytes each, byte offsets 16*8 .. 16*19-1
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; save area for xmm6:xmm15 (10 registers), directly after the keys
+%endif
+; NOTE(review): VARIABLE_OFFSET is the sum of the areas above; the "odd multiple of 8" rule presumably re-aligns rsp to 16 bytes for the vmovdqa accesses - confirm in the prologue
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; save slot for rbx (8 bytes), directly after the keys
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; tweaks + keys + rbx; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; save slots for rdi, rsi, rbx (8 bytes each), after the xmm save area
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; tweaks + keys + xmm6:xmm15 + 3 gprs; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+; 0x87 = x^7 + x^2 + x + 1; presumably the value loaded into ghash_poly_8b by the function body (not visible here) - confirm
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_avx(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; NOTE: despite the names below, ptr_plaintext is the 5th argument (ct, the input) and ptr_ciphertext the 6th (pt, the output)
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8 ; 5th argument: ct in the prototype above
+ %xdefine ptr_ciphertext r9 ; 6th argument: pt in the prototype above
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; temporaries: tmp1/target_ptr_val reuse the registers of ptr_key2/ptr_key1, so they must not be live while the key pointers are still needed
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx ; twtemph:twtempl = high:low qword of the tweak currently being doubled
+
+; key_expansion_128: derive the next AES-128 round key in place, using VEX-encoded instructions only (no legacy-SSE op in the AVX path)
+; %1 xraw_key: output of vaeskeygenassist for the current round key (clobbered: its top dword is broadcast)
+; %2 xtmp: scratch; its low dword is carried through both shuffles unchanged and XORed into the key, so the caller must keep it zero
+; %3 xround_key: current round key on entry, next round key on exit
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+; encrypt_T: AES-128-encrypt the tweak with key2 while expanding key1 into the decryption key schedule
+; %1 xkey2 / %3 xkey1: work registers for the two keys, %2 xstate_tweak: tweak in, encrypted tweak out, %4-%6: scratch, %7/%8: key pointers, %9: base of the 11-slot key area
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+; NOTE: %%xstate_tweak must already hold the initial tweak on entry; %%xtmp is handed to key_expansion_128 as its scratch operand
+; key1 schedule is stored reversed: raw key1 -> slot 10, vaesimc(round key i) -> slot 10-i for i = 1..9, round key 10 -> slot 0
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; slot 10: unmodified key1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1 ; slot 0: round key 10 of key1, stored without vaesimc
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value (TW is the file-level define, not a macro argument)
+%endmacro
+
+
+; initialize: load the first %16 (1..7) input blocks and derive their tweak values from the tweak stored at [TW]
+; %1-%8: state registers (%8 is not referenced by this macro), %9-%15: tweak registers, %16: number of initial blocks
+%macro initialize 16
+; expects ghash_poly_8b to hold the reduction constant (set by the caller, not visible here); clobbers twtempl, twtemph, ghash_poly_8b_temp and flags
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+ ; each step below doubles the 128-bit value twtemph:twtempl (shl/adc) and, on carry out, XORs ghash_poly_8b into the low qword
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+; on exit twtemph:twtempl hold the tweak of the last loaded block; tweak slots 1 .. %16-1 of TW have been written
+
+%endmacro
+
+
+; decrypt_initial: XTS-decrypt the first %17 (1..7) blocks held in the state registers: xor tweak, ARK + 10 AES rounds, xor tweak
+; %1-%8: states (%8 is not referenced here), %9-%15: tweaks, %16: temp for round keys, %17: block count, %18: lt128 flag
+; when %18 == 0 the next 8 Tweak values are generated between the AES rounds, stored to TW, and TW1..TW7 are reloaded
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+ ; tweak doubling is interleaved with the AES rounds; vmovdqa/vaesdec do not write the flags, so CF from shl/adc survives until the cmovc/adc placed after a round
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high qword of Tweak2, stored before twtemph is doubled again
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; CF is consumed by the adc that follows round 5
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+ ; the 8th tweak is not loaded here: it stays in [TW + 16*7] and in twtemph:twtempl, since the macro has only seven tweak operands
+%endif
+
+%endmacro
+
+
+; decrypt_by_eight: decrypt 8 blocks in parallel (xor tweak, AES-128 decrypt rounds with [keys], xor tweak)
+; when last_eight (%18) is 0, the next 8 tweak values are generated in between the AES rounds
+%macro decrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (the call sites in this file pass the memory operand [TW+16*7])
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18 ; 0: also generate the next 8 tweaks, otherwise skip tweak generation
+; twtemph:twtempl is the running 128-bit tweak; each xor/shl/adc/cmovc/xor group below doubles it and xors ghash_poly_8b into the low half when a bit is carried out
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 1: clear the reduction term
+ shl twtempl, 1 ; double the low half, CF = bit shifted out
+ adc twtemph, twtemph ; double the high half plus carry in, CF = top bit shifted out
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction term = polynomial only when the top bit was set
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp ; fold the reduction term into the low half
+ mov [TW + 8*0], twtempl ; next tweak 1 -> slot 0
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start of next tweak 2
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next tweak 2 -> slot 1
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start of next tweak 3
+ shl twtempl, 1 ; CF produced here is consumed by the adc after round 4
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph ; carry from the shl before round 4 is still live: vmovdqa/vaesdec do not write the flags
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next tweak 3 (low half) -> slot 2
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; next tweak 3 (high half) -> slot 2
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start of next tweak 4
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF produced here is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses the carry of the adc issued before round 6
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next tweak 4 -> slot 3
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start of next tweak 5
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next tweak 5 -> slot 4
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start of next tweak 6
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next tweak 6 -> slot 5
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start of next tweak 7
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next tweak 7 (low half) -> slot 6
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; next tweak 7 (high half) -> slot 6
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start of next tweak 8
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; next tweak 8 is NOT stored to slot 7 ([TW + 8*14], [TW + 8*15]) yet:
+; %%TW8 can be that same memory slot and must still hold the current tweak 8 for the final xor below
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl ; slot 7 can be overwritten now; this store is emitted for either value of last_eight
+ mov [TW + 8*15], twtemph
+ ; load tweaks 1-7 for the next call into registers; tweak 8 stays in memory at [TW + 16*7]
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_dec_avx:function
+XTS_AES_128_dec_avx: ; entry point: XTS decryption using 128-bit AES (AVX code path)
+; prologue: reserve the local frame, then save rbx (plus rdi, rsi and xmm6-xmm15 for win64); they are restored at _ret_
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys ; macro defined earlier in the file. NOTE(review): presumably encrypts the tweak with key2 and expands key1 into `keys` - confirm
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; 5th argument (on the stack for win64): pointer the blocks are read from
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; 6th argument (on the stack for win64): pointer the results are written to
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; hold back the last 8 whole blocks (128 bytes); they are processed at _last_eight/_done_final without further stitched tweak generation
+ jl _less_than_128_bytes ; fewer than 8 whole blocks in total
+
+ add target_ptr_val, ptr_ciphertext ; output address at which only the final 8 whole blocks remain
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = (number of whole blocks mod 8) * 16
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+; no match above means tmp1 == (7 << 4): fall through to _initial_num_blocks_is_7
+_initial_num_blocks_is_7: ; 7 leading blocks are decrypted before the 8-block loop
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7 ; advance the source pointer past the 7 leading blocks
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 ; last argument 0: also prepares the tweaks for the following 8 blocks
+ ; store the 7 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks are left
+
+ jmp _main_loop
+_initial_num_blocks_is_6: ; 6 leading blocks are decrypted before the 8-block loop
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6 ; advance the source pointer past the 6 leading blocks
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 ; last argument 0: also prepares the tweaks for the following 8 blocks
+ ; store the 6 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks are left
+
+ jmp _main_loop
+_initial_num_blocks_is_5: ; 5 leading blocks are decrypted before the 8-block loop
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5 ; advance the source pointer past the 5 leading blocks
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 ; last argument 0: also prepares the tweaks for the following 8 blocks
+ ; store the 5 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks are left
+
+ jmp _main_loop
+_initial_num_blocks_is_4: ; 4 leading blocks are decrypted before the 8-block loop
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4 ; advance the source pointer past the 4 leading blocks
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 ; last argument 0: also prepares the tweaks for the following 8 blocks
+ ; store the 4 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks are left
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3: ; 3 leading blocks are decrypted before the 8-block loop
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3 ; advance the source pointer past the 3 leading blocks
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 ; last argument 0: also prepares the tweaks for the following 8 blocks
+ ; store the 3 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks are left
+
+ jmp _main_loop
+_initial_num_blocks_is_2: ; 2 leading blocks are decrypted before the 8-block loop
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2 ; advance the source pointer past the 2 leading blocks
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 ; last argument 0: also prepares the tweaks for the following 8 blocks
+ ; store the 2 decrypted blocks
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks are left
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1: ; 1 leading block is decrypted before the 8-block loop
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1 ; advance the source pointer past the leading block
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 ; last argument 0: also prepares the tweaks for the following 8 blocks
+ ; store the decrypted block
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks are left
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0: ; whole-block count is a multiple of 8: no leading blocks, only build the first 8 tweaks
+ mov twtempl, [TW+8*0] ; reload tweak 1 from slot 0 (NOTE(review): presumably written there by encrypt_T - confirm)
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0] ; tweak 1
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; each group: double twtemph:twtempl, xor ghash_poly_8b into the low half if a bit was carried out
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1] ; tweak 2
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2] ; tweak 3
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3] ; tweak 4
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4] ; tweak 5
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5] ; tweak 6
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6] ; tweak 7
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ; tweak 8 is not loaded into a register: decrypt_by_eight is given the memory operand [TW+16*7] for it
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; exactly 8 whole blocks: skip the loop, otherwise fall through into _main_loop
+_main_loop: ; decrypt 8 blocks per iteration while generating the tweaks for the next 8
+ ; load 8 input blocks (this decrypt routine reads its input through ptr_plaintext)
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 ; tweak 8 is a memory operand; last argument 0 = also generate the next 8 tweaks
+
+ ; store the 8 decrypted blocks (output is written through ptr_ciphertext)
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop ; stop when only the final 8 whole blocks remain, then fall through to _last_eight
+
+_last_eight: ; final 8 whole blocks; a trailing partial block (if any) is handled by cipher stealing
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final ; length is a multiple of 16: no cipher stealing needed
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7] ; tweak that would normally belong to the 8th block
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl ; the 8th block is decrypted with the newly generated tweak instead
+ mov [TW + 16*7+8], twtemph
+
+ ; load the last 8 whole input blocks (read through ptr_plaintext)
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 ; last argument 1 = no further tweak generation
+
+ ; store the first 7 decrypted blocks; xmm8 (8th block) is finished in _steal_cipher
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final: ; length is a multiple of 16: decrypt the final 8 blocks, no cipher stealing
+ ; load the last 8 input blocks (read through ptr_plaintext)
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 ; last argument 1 = no further tweak generation
+
+ ; store the first 7 decrypted blocks; xmm8 (8th block) is stored at _done
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher: ; on entry: xmm8 = decrypted last whole block, N_val = number of trailing bytes (1..15), [TW] = tweak saved for the rebuilt block
+ ; start cipher stealing
+
+ vmovdqa xmm2, xmm8 ; keep an unshifted copy; its bytes N_val..15 are blended into the rebuilt block below
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 ; the last N_val bytes stored form the partial output block; the leading 16-N_val bytes are rewritten by the store at _done
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1] ; flip bit 7 of every control byte
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ; xmm0 is the explicit mask operand here: bytes whose mask bit 7 is set come from xmm2
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW] ; execution then falls through to _done, which stores xmm8
+
+_done: ; common exit: xmm8 holds the last whole output block
+ ; store the last whole output block; every path reaches here with ptr_ciphertext + 16*7 addressing it
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_: ; epilogue: restore the registers saved in the prologue and release the local frame
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes: ; fewer than 8 whole blocks: dispatch on the whole-block count (1..7)
+ cmp N_val, 16
+ jb _ret_ ; less than one whole block: return without writing any output
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = number of whole blocks * 16 (N_val is below 128 on this path)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+; no match above means 7 whole blocks: fall through to _num_blocks_is_7
+
+
+
+_num_blocks_is_7: ; total input is 7 whole blocks plus 0..15 trailing bytes
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1 ; bias the source pointer so that the +112 offset used in _steal_cipher addresses block 7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7 ; no trailing bytes: plain decrypt of the 7 blocks
+
+_steal_cipher_7: ; trailing bytes present: block 7 and the rebuilt block swap tweaks
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; generate one more tweak from twtemph:twtempl
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15 ; save tweak 7 in slot 0; _steal_cipher reads it from [TW]
+ vmovdqa xmm15, [TW+16*1] ; block 7 is decrypted with the newly generated tweak
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store the first 6 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; bias the output pointer so that +16*7 / +112 addresses block 7
+ vmovdqa xmm8, xmm7 ; _steal_cipher expects the last whole block in xmm8
+ jmp _steal_cipher
+
+_done_7: ; no trailing bytes
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store the first 6 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; bias the output pointer so that the store at _done (+16*7) lands on block 7
+ vmovdqa xmm8, xmm7 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6: ; total input is 6 whole blocks plus 0..15 trailing bytes
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2 ; bias the source pointer so that the +112 offset used in _steal_cipher addresses block 6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6 ; no trailing bytes: plain decrypt of the 6 blocks
+
+_steal_cipher_6: ; trailing bytes present: block 6 and the rebuilt block swap tweaks
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; generate one more tweak from twtemph:twtempl
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14 ; save tweak 6 in slot 0; _steal_cipher reads it from [TW]
+ vmovdqa xmm14, [TW+16*1] ; block 6 is decrypted with the newly generated tweak
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store the first 5 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; bias the output pointer so that +16*7 / +112 addresses block 6
+ vmovdqa xmm8, xmm6 ; _steal_cipher expects the last whole block in xmm8
+ jmp _steal_cipher
+
+_done_6: ; no trailing bytes
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store the first 5 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; bias the output pointer so that the store at _done (+16*7) lands on block 6
+ vmovdqa xmm8, xmm6 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5: ; total input is 5 whole blocks plus 0..15 trailing bytes
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3 ; bias the source pointer so that the +112 offset used in _steal_cipher addresses block 5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5 ; no trailing bytes: plain decrypt of the 5 blocks
+
+_steal_cipher_5: ; trailing bytes present: block 5 and the rebuilt block swap tweaks
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; generate one more tweak from twtemph:twtempl
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13 ; save tweak 5 in slot 0; _steal_cipher reads it from [TW]
+ vmovdqa xmm13, [TW+16*1] ; block 5 is decrypted with the newly generated tweak
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store the first 4 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; bias the output pointer so that +16*7 / +112 addresses block 5
+ vmovdqa xmm8, xmm5 ; _steal_cipher expects the last whole block in xmm8
+ jmp _steal_cipher
+
+_done_5: ; no trailing bytes
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store the first 4 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; bias the output pointer so that the store at _done (+16*7) lands on block 5
+ vmovdqa xmm8, xmm5 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4: ; total input is 4 whole blocks plus 0..15 trailing bytes
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4 ; bias the source pointer so that the +112 offset used in _steal_cipher addresses block 4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4 ; no trailing bytes: plain decrypt of the 4 blocks
+
+_steal_cipher_4: ; trailing bytes present: block 4 and the rebuilt block swap tweaks
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; generate one more tweak from twtemph:twtempl
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12 ; save tweak 4 in slot 0; _steal_cipher reads it from [TW]
+ vmovdqa xmm12, [TW+16*1] ; block 4 is decrypted with the newly generated tweak
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; bias the output pointer so that +16*7 / +112 addresses block 4
+ vmovdqa xmm8, xmm4 ; _steal_cipher expects the last whole block in xmm8
+ jmp _steal_cipher
+
+_done_4: ; no trailing bytes
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; bias the output pointer so that the store at _done (+16*7) lands on block 4
+ vmovdqa xmm8, xmm4 ; _done stores xmm8
+ jmp _done
+
+
+
+
+_num_blocks_is_3: ; total input is 3 whole blocks plus 0..15 trailing bytes
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5 ; bias the source pointer so that the +112 offset used in _steal_cipher addresses block 3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3 ; no trailing bytes: plain decrypt of the 3 blocks
+
+_steal_cipher_3: ; trailing bytes present: block 3 and the rebuilt block swap tweaks
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; generate one more tweak from twtemph:twtempl
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11 ; save tweak 3 in slot 0; _steal_cipher reads it from [TW]
+ vmovdqa xmm11, [TW+16*1] ; block 3 is decrypted with the newly generated tweak
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; bias the output pointer so that +16*7 / +112 addresses block 3
+ vmovdqa xmm8, xmm3 ; _steal_cipher expects the last whole block in xmm8
+ jmp _steal_cipher
+
+_done_3: ; no trailing bytes
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 decrypted blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; bias the output pointer so that the store at _done (+16*7) lands on block 3
+ vmovdqa xmm8, xmm3 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2: ; total input is 2 whole blocks plus 0..15 trailing bytes
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6 ; bias the source pointer so that the +112 offset used in _steal_cipher addresses block 2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2 ; no trailing bytes: plain decrypt of the 2 blocks
+
+_steal_cipher_2: ; trailing bytes present: block 2 and the rebuilt block swap tweaks
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; generate one more tweak from twtemph:twtempl
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10 ; save tweak 2 in slot 0; _steal_cipher reads it from [TW]
+ vmovdqa xmm10, [TW+16*1] ; block 2 is decrypted with the newly generated tweak
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first decrypted block
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; bias the output pointer so that +16*7 / +112 addresses block 2
+ vmovdqa xmm8, xmm2 ; _steal_cipher expects the last whole block in xmm8
+ jmp _steal_cipher
+
+_done_2: ; no trailing bytes
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first decrypted block
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; bias the output pointer so that the store at _done (+16*7) lands on block 2
+ vmovdqa xmm8, xmm2 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1: ; total input is 1 whole block plus 0..15 trailing bytes
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7 ; bias the source pointer so that the +112 offset used in _steal_cipher addresses block 1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1 ; no trailing bytes: plain decrypt of the single block
+
+_steal_cipher_1: ; trailing bytes present: block 1 and the rebuilt block swap tweaks
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; generate one more tweak from twtemph:twtempl
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9 ; save tweak 1 in slot 0; _steal_cipher reads it from [TW]
+ vmovdqa xmm9, [TW+16*1] ; block 1 is decrypted with the newly generated tweak
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the only whole block is finished in _steal_cipher
+
+ sub ptr_ciphertext, 16*7 ; bias the output pointer so that +16*7 / +112 addresses block 1
+ vmovdqa xmm8, xmm1 ; _steal_cipher expects the last whole block in xmm8
+ jmp _steal_cipher
+
+_done_1: ; no trailing bytes
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the only block is stored at _done
+
+ sub ptr_ciphertext, 16*7 ; bias the output pointer so that the store at _done (+16*7) lands on block 1
+ vmovdqa xmm8, xmm1 ; _done stores xmm8
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; 32-byte table of vpshufb control bytes; _steal_cipher loads 16 bytes from it at offset N_val and at offset 16-N_val
+; control bytes with bit 7 set produce a zero byte; 16-byte loads at offsets 1..15 give the values shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1: ; 0x80 in every byte; xored into a table entry in _steal_cipher to flip bit 7 of each control byte
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm
new file mode 100644
index 00000000..151113e7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm
@@ -0,0 +1,1747 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; the second key argument (k1, the data key) is copied to the stack, aligned to 16 Bytes
+; the first key argument (k2, the tweak key) is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel ; symbol references default to RIP-relative addressing
+%define TW rsp ; store 8 tweak values (8 x 16 bytes at the bottom of the frame)
+%define keys rsp + 16*8 ; store 11 expanded keys (11 x 16 bytes, copied from k1 by encrypt_T)
+; win64 only: the xmm save area follows the keys
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15 (10 x 16 bytes)
+%endif
+; _gpr is the last region of the frame; VARIABLE_OFFSET is the size subtracted from rsp on entry
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; = 312; VARIABLE_OFFSET has to be an odd multiple of 8 (keeps the vmovdqa stack slots 16-byte aligned, assuming rsp = 8 mod 16 at entry)
+%else
+%define _gpr rsp + 16*29 ; store rbx, rdi, rsi (slots 8*0, 8*1, 8*2 in that order)
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; = 488; VARIABLE_OFFSET has to be an odd multiple of 8 (same alignment reason as above)
+%endif
+; constant XORed into the low tweak qword whenever doubling the 128-bit tweak carries out of bit 127
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" decryption (11 expanded round keys), 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters (register names per calling convention, in prototype order)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi ; arg 1: k2, tweak key schedule
+ %xdefine ptr_key1 rsi ; arg 2: k1, data key schedule
+ %xdefine T_val rdx ; arg 3: pointer to the initial tweak
+ %xdefine N_val rcx ; arg 4: sector size in bytes
+ %xdefine ptr_plaintext r8 ; arg 5: INPUT buffer (named ct in the prototype; this routine reads from it)
+ %xdefine ptr_ciphertext r9 ; arg 6: OUTPUT buffer (named pt in the prototype; this routine writes to it)
+%else
+ %xdefine ptr_key2 rcx ; arg 1: k2, tweak key schedule
+ %xdefine ptr_key1 rdx ; arg 2: k1, data key schedule
+ %xdefine T_val r8 ; arg 3: pointer to the initial tweak
+ %xdefine N_val r9 ; arg 4: sector size in bytes
+ %xdefine ptr_plaintext r10; arg 5 (input buffer), loaded by the function from [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; arg 6 (output buffer), loaded by the function from [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+; NOTE: tmp1 and target_ptr_val below reuse the registers of ptr_key2 and ptr_key1
+; arguments for temp parameters (only valid once the key pointers are no longer needed)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi ; same register as ptr_key2
+ %define target_ptr_val rsi ; same register as ptr_key1
+ %define ghash_poly_8b r10 ; holds GHASH_POLY
+ %define ghash_poly_8b_temp r11 ; 0 or GHASH_POLY, selected by the carry of the tweak shift
+%else
+ %define tmp1 rcx ; same register as ptr_key2
+ %define target_ptr_val rdx ; same register as ptr_key1
+ %define ghash_poly_8b rdi ; holds GHASH_POLY (rdi is saved in the _gpr area on win64)
+ %define ghash_poly_8b_temp rsi ; 0 or GHASH_POLY (rsi is saved in the _gpr area on win64)
+%endif
+; the running tweak is kept as a 128-bit value in twtemph:twtempl
+%define twtempl rax ; global temp registers used for tweak computation (low 64 bits)
+%define twtemph rbx ; high 64 bits of the running tweak (rbx is saved in the _gpr area)
+
+
+; macro to encrypt the tweak value with the k2 round keys (ARK + 9 vaesenc + vaesenclast)
+; interleaved with that, the 11 k1 round keys are copied to the aligned stack area
+%macro encrypt_T 8
+%define %%xkey2 %1 ; xmm scratch: current k2 round key
+%define %%xstate_tweak %2 ; xmm in/out: tweak on entry, encrypted tweak on exit
+%define %%xkey1 %3 ; xmm scratch: k1 round key being copied
+%define %%xraw_key %4 ; not referenced in this macro body
+%define %%xtmp %5 ; not referenced in this macro body
+%define %%ptr_key2 %6 ; pointer to the 11 k2 round keys (read with unaligned loads)
+%define %%ptr_key1 %7 ; pointer to the 11 k1 round keys (read with unaligned loads)
+%define %%ptr_expanded_keys %8 ; destination for the k1 round keys (written with aligned stores)
+; k1 keys are copied index-for-index (source slot i -> destination slot i), walking from slot 10 down to 0
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value in tweak slot 0 (TW is referenced directly, not via a macro parameter)
+%endmacro
+
+
+; generate initial tweak values: tweak 1 is read from [TW], each further tweak is the previous one doubled
+; load the initial input blocks from ptr_plaintext (one block and one tweak per num_initial_blocks)
+%macro initialize 16
+; expects ghash_poly_8b to hold GHASH_POLY; on exit twtemph:twtempl holds the last tweak generated here
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro body)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; temp = 0 (stays 0 unless the shift carries out)
+ shl twtempl, 1 ; low qword << 1, CF = bit shifted out
+ adc twtemph, twtemph ; high qword << 1 plus carry from the low qword, CF = old bit 127
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; carry out -> select the reduction constant
+ xor twtempl, ghash_poly_8b_temp ; fold the reduction into the low qword
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph; tweak 2 now complete in stack slot 1
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph; tweak 3 now complete in stack slot 2
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph; tweak 4 now complete in stack slot 3
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph; tweak 5 now complete in stack slot 4
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph; tweak 6 now complete in stack slot 5
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph; tweak 7 now complete in stack slot 6
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES (the macro name says encrypt, but the rounds below use vaesdec/vaesdeclast)
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; when lt128 is 0, the next 8 Tweak values are generated, interleaved with the AES rounds
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro body)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register (holds the current round key)
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes (non-zero: skip generation of the next tweaks)
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+; tweak doubling starts here; the carry left by adc is consumed by the cmovc after round 1, so the vector instructions in between are relied on to leave the flags untouched
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses the carry from the adc issued before round 1
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated (its high qword is stored after round 3)
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high qword of Tweak2
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; carry is consumed by the adc after round 5
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values (the TW registers still hold the tweaks of the blocks just processed)
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values (the 8th one stays in memory at [TW + 16*7])
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel (the macro name says encrypt, but the rounds below use vaesdec/vaesdeclast)
+; generate next 8 tweak values, interleaved with the AES rounds, when last_eight is 0
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (the call sites in this file pass the memory operand [TW+16*7])
+%define %%T0 %17 ; Temp register (holds the current round key)
+%define %%last_eight %18 ; non-zero: skip generation of the next tweaks
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; temp = 0 (stays 0 unless the shift carries out)
+ shl twtempl, 1 ; double the 128-bit tweak in twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; carry out -> select the reduction constant
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next tweak 1 stored
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next tweak 2 stored
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; carry is consumed by the adc after round 4 (vector instructions in between are relied on to leave flags untouched)
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next tweak 3, low qword
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; next tweak 3, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; carry is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next tweak 4 stored
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next tweak 5 stored
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next tweak 6 stored
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next tweak 7, low qword
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; next tweak 7, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; next tweak 8 is now in twtemph:twtempl, but its store to [TW + 8*14]/[TW + 8*15] is deferred:
+; slot [TW+16*7] is still read as the TW8 operand by the final tweak xor below
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl ; executed regardless of last_eight: writes twtemph:twtempl into tweak slot 7
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_dec_expanded_key_avx:function
+XTS_AES_128_dec_expanded_key_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ; xmm0 is the blend mask (explicit fourth operand in the AVX form)
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ; decrypt the last block with ciphertext stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+; 32-byte shuffle-control table; the ciphertext-stealing code loads 16 unaligned bytes from it at a byte offset
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; first 16 bytes: all but byte 0 have bit 7 set, which makes vpshufb write a zero byte
+dq 0x0706050403020100, 0x000e0d0c0b0a0908 ; last 16 bytes: plain source-byte indices
+; XORed into a control vector loaded from the table, flipping bit 7 of every control byte
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm
new file mode 100644
index 00000000..5ac14416
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm
@@ -0,0 +1,1746 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; expanded keys are not required to be aligned
+; input (ciphertext) and output (plaintext) buffers are not required to be aligned
+; the k1 round keys (second argument) are copied to the stack, aligned to 16 bytes
+; the k2 round keys (first argument) are needed only once, for the tweak, so they are not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values (16 bytes each, starting at rsp)
+%define keys rsp + 16*8 ; store 11 expanded keys (16 bytes each, directly after the tweak slots)
+; win64 only: xmm save area placed after the 8 tweak slots and the 11 key slots
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+; general-purpose register save area and total frame size; every output format other than elf64 takes the %else branch
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx (directly after the key slots)
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; tweaks + keys + rbx; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx (after the 10-slot xmm save area)
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; tweaks + keys + xmm6:xmm15 + 3 GPRs; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+; NOTE(review): presumably the constant XORed into the low tweak word when a doubling carries out - confirm where ghash_poly_8b is loaded
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters (same order as the prototype: k2, k1, TW_initial, N, ct, pt)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8 ; fifth argument: the input buffer (ct in the prototype)
+ %xdefine ptr_ciphertext r9 ; sixth argument: the output buffer (pt in the prototype)
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+; the temporaries below reuse the key-pointer registers: tmp1 aliases ptr_key2 and target_ptr_val aliases ptr_key1
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+; NOTE(review): the aliasing above presumably relies on the key pointers being dead before the temporaries are written - confirm in the function body
+%define twtempl rax ; global temp registers used for tweak computation (low 64 bits of the tweak)
+%define twtemph rbx ; high 64 bits of the tweak
+
+
+; macro to encrypt the tweak value with the key2 round keys (ARK + 9 aesenc + aesenclast) and store the result at [TW]
+; between the tweak rounds it also copies the 11 key1 round keys, unchanged and at the same index, to %%ptr_expanded_keys
+%macro encrypt_T 8
+%define %%xkey2 %1 ; xmm scratch for the current key2 round key
+%define %%xstate_tweak %2 ; xmm holding the tweak: value to encrypt on entry, encrypted tweak on exit
+%define %%xkey1 %3 ; xmm scratch for the key1 round key being copied
+%define %%xraw_key %4 ; not referenced in this macro body
+%define %%xtmp %5 ; not referenced in this macro body
+%define %%ptr_key2 %6 ; base address of the key2 round keys (read with unaligned loads)
+%define %%ptr_key1 %7 ; base address of the key1 round keys (read with unaligned loads)
+%define %%ptr_expanded_keys %8 ; destination of the key1 copy; written with movdqa, so it must be 16-byte aligned
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*0]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate the tweak values for the first %%num_initial_blocks blocks, starting from the tweak stored at [TW]
+; load the first %%num_initial_blocks 16-byte input blocks from ptr_plaintext (unaligned loads)
+%macro initialize 16
+; %%ST8 is accepted for symmetry with the other macros but is not referenced in this macro body
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+; each later tweak is the previous one doubled: the 128-bit value twtemph:twtempl is shifted left by one bit and,
+; when a bit is carried out of twtemph, ghash_poly_8b is XORed into twtempl
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0] ; low 64 bits of tweak 1
+ mov twtemph, [TW+8*1] ; high 64 bits of tweak 1
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+; on exit twtemph:twtempl still hold the last tweak generated above
+
+%endmacro
+
+
+; process the initial blocks: AES-128 decryption (aesdec/aesdeclast) with the tweak XORed in before and after
+; 1, 2, 3, 4, 5, 6 or 7 blocks are processed; %%ST8 is accepted but not referenced in this macro body
+; when %%lt128 is 0 the next 8 Tweak values are generated between the AES rounds and written to TW
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks are processed
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes: 1 skips the tweak generation, 0 also produces the next 8 tweaks
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+; tweak doubling starts here; its carry flag is consumed by the cmovc placed after round 1 (movdqa/aesdec leave the flags alone)
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses the carry produced by the adc before round 1
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high half of Tweak2, stored before the next doubling changes twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; carry is picked up by the adc after round 5
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+; the tweak registers are reloaded only after the final XOR above has used their old values
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+; the eighth new tweak is not loaded into a register here; it stays at [TW + 16*7]
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel: xor tweak, ARK, 9 x aesdec, aesdeclast, xor tweak
+; when last_eight (%18) is 0, also generate the next 8 tweak values between the AES rounds
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (the invocations in this file pass the memory operand at TW+16*7)
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18 ; 0: generate the next 8 tweaks, otherwise: skip tweak generation
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK: xor the first round key ([keys]) into every state
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+ ; next tweak 1: shift twtemph:twtempl left by one bit; ghash_poly_8b is xored into the low half if a bit carried out
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp ; next tweak 1 complete
+ mov [TW + 8*0], twtempl ; next tweak 1 -> slot 0 (low half)
+ mov [TW + 8*1], twtemph ; next tweak 1 -> slot 0 (high half)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1 ; next tweak 2
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next tweak 2 -> slot 1
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 3 starts here; its CF is consumed by the adc placed after round 4
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph ; continues next tweak 3 using CF from the shl issued before round 4
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next tweak 3 -> slot 2 (low half)
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; next tweak 3 -> slot 2 (high half)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 4
+ adc twtemph, twtemph ; its CF is consumed by the cmovc placed after round 6
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; continues next tweak 4 using CF from the adc issued before round 6
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next tweak 4 -> slot 3
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 5
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp ; next tweak 5 complete
+ mov [TW + 8*8], twtempl ; next tweak 5 -> slot 4
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1 ; next tweak 6
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next tweak 6 -> slot 5
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 7
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next tweak 7 -> slot 6 (low half)
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; next tweak 7 -> slot 6 (high half)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 8
+ adc twtemph, twtemph
+%endif
+ ; next tweak 8 is completed below but its store to slot 7 is deferred until after the final tweak xor
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10 (last round, aesdeclast)
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+ ; slot 7 is written only now: the pxor with TW8 above reads that slot when TW8 is the memory operand; not guarded by last_eight
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values (slots 0-6) into the tweak registers; also not guarded by last_eight
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_dec_expanded_key_sse:function
+XTS_AES_128_dec_expanded_key_sse:
+ ; Entry: set up the stack frame and first tweak, then dispatch on the byte count N_val
+ sub rsp, VARIABLE_OFFSET
+ ; rbx (plus rdi, rsi and xmm6-xmm15 on win64) are saved here and restored at _ret_
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+ ; encrypt_T is a macro defined earlier in the file (outside this excerpt)
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; hold back the last 128 bytes (8 blocks): they are processed after the main loop without stitched tweak generation
+ jl _less_than_128_bytes ; fewer than 8 whole blocks in total
+
+ add target_ptr_val, ptr_ciphertext ; now the output address at which the main loop stops
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks mod 8)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1 ; no match at all falls through to the 7-block case
+
+_initial_num_blocks_is_7: ; handle 7 leading blocks so that a multiple of 8 blocks remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store the 7 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+ ; if only the held-back final 8 blocks remain, skip the main loop
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6: ; handle 6 leading blocks so that a multiple of 8 blocks remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store the 6 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+ ; if only the held-back final 8 blocks remain, skip the main loop
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5: ; handle 5 leading blocks so that a multiple of 8 blocks remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store the 5 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+ ; if only the held-back final 8 blocks remain, skip the main loop
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4: ; handle 4 leading blocks so that a multiple of 8 blocks remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store the 4 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+ ; if only the held-back final 8 blocks remain, skip the main loop
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3: ; handle 3 leading blocks so that a multiple of 8 blocks remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store the 3 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+ ; if only the held-back final 8 blocks remain, skip the main loop
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2: ; handle 2 leading blocks so that a multiple of 8 blocks remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store the 2 processed blocks to the output buffer
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+ ; if only the held-back final 8 blocks remain, skip the main loop
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1: ; handle 1 leading block so that a multiple of 8 blocks remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store the single processed block to the output buffer
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+ ; if only the held-back final 8 blocks remain, skip the main loop
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0: ; block count is a multiple of 8: only build the 8 tweaks for the first group
+ mov twtempl, [TW+8*0] ; low 64 bits of the tweak stored in slot 0
+ mov twtemph, [TW+8*1] ; high 64 bits
+ movdqa xmm9, [TW+16*0] ; tweak 1
+ ; each group below shifts twtemph:twtempl left by one bit, xors ghash_poly_8b into the low half on carry-out, and stores the result in the next slot
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1] ; tweak 2
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2] ; tweak 3
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3] ; tweak 4
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4] ; tweak 5
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5] ; tweak 6
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6] ; tweak 7
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ; tweak 8 is not loaded into a register: it stays in memory at [TW+16*7] (was: movdqa xmm16, [TW+16*7])
+ ; exactly 8 blocks in total? then go straight to the final group; otherwise fall through into the main loop
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop: ; one iteration processes 128 bytes (8 blocks) and prepares the tweaks for the next 8
+ ; load 8 input blocks
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+ ; tweaks 1-7 are in xmm9-xmm15, tweak 8 is the memory operand [TW+16*7]; last argument 0 enables tweak generation
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store 8 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+ ; loop until the output pointer reaches the point where only the held-back 8 blocks remain
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight: ; final 8 whole blocks, possibly followed by a partial block of N_val mod 16 bytes
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final ; no partial block: no cipher stealing needed
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+ ; block 8 is processed with the newly generated tweak; the tweak just saved at [TW] is used later at _steal_cipher
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load the last 8 whole input blocks
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store the first 7 output blocks; the 8th stays in xmm8 for _steal_cipher
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final: ; final 8 whole blocks when the length is a multiple of 16 (no partial block)
+ ; load the last 8 input blocks
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store the first 7 output blocks; the 8th (xmm8) is stored at _done
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+ ; on entry: xmm8 = last whole processed block, N_val = number of trailing bytes (non-zero), [TW] = tweak for the merged block
+ movdqa xmm2, xmm8 ; keep an unshuffled copy for the blend below
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table] ; twtempl is reused as a scratch pointer from here on
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8 ; 16-byte store ending at offset 128+N_val; offsets 112+N_val..127 are rewritten by the block store at _done
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1] ; flip bit 7 of every control byte
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;run the merged last block through the aesdec rounds
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value; execution then falls through into _done
+ pxor xmm8, [TW]
+
+_done:
+ ; store last whole output block (always at offset 16*7; the short-input paths bias ptr_ciphertext to match)
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+ ; restore the registers saved in the prologue and release the stack frame
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes: ; reached when N_val rounded down to a multiple of 16 is below 128
+ cmp N_val, 16
+ jb _ret_ ; less than one whole block: return without writing any output
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * number of whole blocks (1..7 on this path)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1 ; no match falls through to the 7-block case
+
+
+
+
+_num_blocks_is_7: ; short input with 7 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ ; bias the input pointer by -16 so the fixed offset 112+N_val used at _steal_cipher addresses the tail of a 7-block input
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7: ; a partial block follows: generate one more tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ ; park the tweak of block 7 at [TW] for _steal_cipher and give block 7 the newly generated tweak instead
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store the first 6 output blocks; block 7 is handed over in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; same bias for the output pointer
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7: ; no partial block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store the first 6 output blocks; block 7 is stored at _done from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; so that offset 16*7 at _done addresses block 7
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6: ; short input with 6 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ ; bias the input pointer by -32 so the fixed offset 112+N_val used at _steal_cipher addresses the tail of a 6-block input
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6: ; a partial block follows: generate one more tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ ; park the tweak of block 6 at [TW] for _steal_cipher and give block 6 the newly generated tweak instead
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store the first 5 output blocks; block 6 is handed over in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; same bias for the output pointer
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6: ; no partial block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store the first 5 output blocks; block 6 is stored at _done from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; so that offset 16*7 at _done addresses block 6
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5: ; short input with 5 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ ; bias the input pointer by -48 so the fixed offset 112+N_val used at _steal_cipher addresses the tail of a 5-block input
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5: ; a partial block follows: generate one more tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ ; park the tweak of block 5 at [TW] for _steal_cipher and give block 5 the newly generated tweak instead
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store the first 4 output blocks; block 5 is handed over in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; same bias for the output pointer
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5: ; no partial block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store the first 4 output blocks; block 5 is stored at _done from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; so that offset 16*7 at _done addresses block 5
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4: ; short input with 4 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ ; bias the input pointer by -64 so the fixed offset 112+N_val used at _steal_cipher addresses the tail of a 4-block input
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4: ; a partial block follows: generate one more tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ ; park the tweak of block 4 at [TW] for _steal_cipher and give block 4 the newly generated tweak instead
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 output blocks; block 4 is handed over in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; same bias for the output pointer
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4: ; no partial block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 output blocks; block 4 is stored at _done from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; so that offset 16*7 at _done addresses block 4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3: ; short input with 3 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ ; bias the input pointer by -80 so the fixed offset 112+N_val used at _steal_cipher addresses the tail of a 3-block input
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3: ; a partial block follows: generate one more tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ ; park the tweak of block 3 at [TW] for _steal_cipher and give block 3 the newly generated tweak instead
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 output blocks; block 3 is handed over in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; same bias for the output pointer
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3: ; no partial block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 output blocks; block 3 is stored at _done from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; so that offset 16*7 at _done addresses block 3
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2: ; short input with 2 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ ; bias the input pointer by -96 so the fixed offset 112+N_val used at _steal_cipher addresses the tail of a 2-block input
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2: ; a partial block follows: generate one more tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ ; park the tweak of block 2 at [TW] for _steal_cipher and give block 2 the newly generated tweak instead
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first output block; block 2 is handed over in xmm8
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; same bias for the output pointer
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2: ; no partial block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first output block; block 2 is stored at _done from xmm8
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; so that offset 16*7 at _done addresses block 2
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1: ; short input with 1 whole block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ ; bias the input pointer by -112 so the fixed offset 112+N_val used at _steal_cipher addresses the tail of a 1-block input
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1: ; a partial block follows: generate one more tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ ; park the tweak of block 1 at [TW] for _steal_cipher and give block 1 the newly generated tweak instead
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing to store here: the only whole block is handed over in xmm8
+
+ sub ptr_ciphertext, 16*7 ; same bias for the output pointer
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1: ; no partial block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing to store here: the only block is stored at _done from xmm8
+
+ sub ptr_ciphertext, 16*7 ; so that offset 16*7 at _done addresses block 1
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown (a 16-byte control is loaded at a byte offset into the 32-byte table below):
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; bytes 00, 81, 82, ... 8f in memory order
+dq 0x0706050403020100, 0x000e0d0c0b0a0908 ; bytes 00, 01, 02, ... 0e, 00 in memory order
+; mask1 is xored into a control loaded from the table above to flip bit 7 of every control byte
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm
new file mode 100644
index 00000000..1fe56559
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm
@@ -0,0 +1,1778 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+; Stack frame layout (offsets from rsp after "sub rsp, VARIABLE_OFFSET"):
+;   [rsp + 16*0  .. 16*8 )   TW    : 8 tweak values, 16 bytes each
+;   [rsp + 16*8  .. 16*19)   keys  : 11 round keys of key1, 16 bytes each
+;   elf64: [rsp + 16*19]     _gpr  : saved rbx
+;   win64: [rsp + 16*19 .. 16*29) _xmm : saved xmm6..xmm15
+;          [rsp + 16*29]     _gpr  : saved rbx, rdi, rsi (8 bytes each)
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+; TW and keys are accessed with movdqa, so rsp must be 16-byte aligned after
+; the frame is allocated. NOTE(review): this assumes rsp is 8 mod 16 on entry
+; (return address just pushed by the caller), which is why the frame size
+; must be an odd multiple of 8.
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+; Low byte of the reduction constant XORed into the tweak when the 128-bit
+; left shift carries out (tweak multiplication by x in GF(2^128)).
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_sse(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+; NOTE: the names below follow the encrypt variant of this code. In this
+; decrypt routine the 5th argument (ct, the INPUT) is bound to ptr_plaintext
+; and the 6th argument (pt, the OUTPUT) is bound to ptr_ciphertext: data is
+; loaded from ptr_plaintext and results are stored through ptr_ciphertext.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ ; win64: the 5th and 6th arguments are passed on the stack and are loaded
+ ; into r10/r11 by the function body from the addresses noted below
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+; tmp1 and target_ptr_val reuse the registers of ptr_key2 and ptr_key1; they
+; may only be written once encrypt_T has consumed both key pointers.
+; On win64 ghash_poly_8b / ghash_poly_8b_temp live in rdi / rsi, which the
+; function saves to _gpr before use.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+; twtempl holds the low 64 bits and twtemph the high 64 bits of the tweak
+; currently being advanced (rbx is saved to _gpr by the function)
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+; key_expansion_128 assist, scratch, rkey
+; Derives the next AES-128 round key in place.
+;   %1 assist  : xmm holding the aeskeygenassist result for the current round
+;                key; overwritten with its dword 3 broadcast to all lanes
+;   %2 scratch : xmm work register; its low dword must be zero on entry and is
+;                still zero on exit (callers clear the register once up front)
+;   %3 rkey    : xmm holding the current round key on entry and the next round
+;                key on exit
+; The two shufps/pxor pairs fold the running XOR of the lower dwords of rkey
+; into each higher dword; the final pxor mixes in the broadcast assist word.
+%macro key_expansion_128 3
+%define %%assist %1
+%define %%scratch %2
+%define %%rkey %3
+ pshufd %%assist, %%assist, 0xff
+ shufps %%scratch, %%rkey, 0x10
+ pxor %%rkey, %%scratch
+ shufps %%scratch, %%rkey, 0x8c
+ pxor %%rkey, %%scratch
+ pxor %%rkey, %%assist
+%endmacro
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+;
+; encrypt_T xkey2, xstate_tweak, xkey1, xraw_key, xtmp, xtmp2, ptr_key2, ptr_key1, ptr_expanded_keys
+;   %1 xkey2             : xmm scratch, holds the running round key of key2
+;   %2 xstate_tweak      : xmm, initial tweak on entry; encrypted tweak on exit
+;   %3 xkey1             : xmm scratch, holds the running round key of key1
+;   %4 xraw_key          : xmm scratch for the aeskeygenassist output
+;   %5 xtmp              : xmm scratch for key_expansion_128 (the caller clears
+;                          it before invoking this macro)
+;   %6 xtmp2             : xmm scratch for the aesimc output
+;   %7 ptr_key2          : address of the 16-byte tweak key (unaligned load)
+;   %8 ptr_key1          : address of the 16-byte data key (unaligned load)
+;   %9 ptr_expanded_keys : 16-byte aligned storage for 11 round keys
+;
+; key2 round keys are consumed immediately by the tweak encryption and are not
+; stored. key1 round keys are stored in REVERSE order for decryption:
+;   slot 10    = original key1
+;   slot 10-i  = aesimc(round key i), for i = 1..9
+;   slot 0     = round key 10 (no aesimc)
+; so the decrypt macros can walk slots 0..10 with pxor/aesdec/aesdeclast.
+; The encrypted tweak is also written to [TW] (hard-coded, not a parameter).
+
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; original key1 is the last key used when decrypting
+
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ aesimc %%xtmp2, %%xkey1 ; inverse-mix-columns form needed by aesdec
+ movdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*0], %%xkey1 ; last round key is stored as-is (no aesimc); used for the initial ARK when decrypting
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial input blocks
+;
+; initialize ST1..ST8, TW1..TW7, num_initial_blocks
+;   %1..%8   ST1..ST8 : xmm state registers; the first num_initial_blocks of
+;                       them receive 16-byte blocks loaded (unaligned) from
+;                       ptr_plaintext. ST8 is accepted but never used here.
+;   %9..%15  TW1..TW7 : xmm registers receiving the matching tweak values
+;   %16 num_initial_blocks : assemble-time constant, 1..7
+;
+; Reads the first tweak from [TW+16*0]; every further tweak is the previous one
+; shifted left by one bit across twtemph:twtempl, with ghash_poly_8b XORed into
+; the low half when the shift carries out. Each generated tweak is written to
+; its TW slot and reloaded into the xmm register.
+; Expects ghash_poly_8b to be loaded already. Clobbers ghash_poly_8b_temp and
+; the flags; leaves the last generated tweak in twtemph:twtempl.
+; In this decrypt routine ptr_plaintext is the pointer to the input data.
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reduction value defaults to 0
+ shl twtempl, 1 ; 128-bit shift left by one: low half ...
+ adc twtemph, twtemph ; ... then high half, pulling in the carry
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; bit shifted out of the top -> reduce
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+;
+; decrypt_initial ST1..ST8, TW1..TW7, T0, num_blocks, lt128
+;   %1..%8   ST1..ST8 : xmm state registers; the first num_blocks hold input
+;                       blocks on entry and decrypted blocks on exit
+;   %9..%15  TW1..TW7 : xmm tweak registers matching ST1..ST7
+;   %16 T0            : xmm scratch used to hold the current round key
+;   %17 num_blocks    : assemble-time constant, 1..7
+;   %18 lt128         : assemble-time flag; when 0 the macro also generates the
+;                       next 8 tweaks into TW slots 0..7 and reloads TW1..TW7
+;                       from slots 0..6 (slot 7 stays in memory only); when
+;                       non-zero no tweak generation code is emitted
+;
+; Round keys are read from [keys + 16*0 .. 16*10] in ascending order.
+; With lt128 == 0 the tweak arithmetic continues from the value already held in
+; twtemph:twtempl and is interleaved with the AES rounds. Several tweak updates
+; are split across a round boundary (shl/adc before the round, cmovc after it);
+; this relies on the SSE/AES instructions in between leaving the carry flag
+; untouched, so the statement order must not be changed.
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ ; begin next Tweak1: 128-bit shift left; the carry is consumed after round 1
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; finish next Tweak1 using the carry produced before round 1
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; store the high half of Tweak2, then begin next Tweak3
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; finish Tweak3, then start the shift for Tweak4 (carry consumed after round 5)
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ ; (the TW registers still hold the tweaks of the blocks just processed;
+ ; the newly generated ones live only in the TW memory slots so far)
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ ; (Tweak8 is not loaded: it remains in memory at [TW + 16*7])
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; decrypt 8 blocks in parallel
+; generate next 8 tweak values
+;
+; decrypt_by_eight ST1..ST8, TW1..TW8, T0, last_eight
+;   %1..%8   ST1..ST8 : xmm state registers; input blocks on entry, decrypted
+;                       blocks on exit
+;   %9..%16  TW1..TW8 : tweak operands for ST1..ST8. TW8 is only ever used as
+;                       a pxor source; the main loop passes the memory operand
+;                       [TW+16*7] for it
+;   %17 T0            : xmm scratch used to hold the current round key
+;   %18 last_eight    : assemble-time flag; when 0 the next 8 tweaks are
+;                       computed, interleaved with the AES rounds; when
+;                       non-zero that code is not emitted
+;
+; Round keys are read from [keys + 16*0 .. 16*10] in ascending order.
+; The tweak arithmetic continues from twtemph:twtempl and is split across round
+; boundaries; it relies on the SSE/AES instructions in between leaving the
+; carry flag untouched, so the statement order must not be changed.
+; The tail of the macro (store to TW slot 7 from twtempl/twtemph and reload of
+; TW1..TW7 from TW slots 0..6) is emitted regardless of last_eight.
+%macro decrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+ ; next Tweak1: shift and select the reduction value (applied after round 1)
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+ ; next Tweak1 -> TW slot 0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+ ; next Tweak2 computed here, stored after round 3
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+ ; next Tweak2 -> TW slot 1; start the shift for Tweak3 (carry used after round 4)
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+ ; finish next Tweak3; low half -> TW slot 2 (high half stored after round 5)
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+ ; high half of Tweak3 -> TW slot 2; start Tweak4 (carry used after round 6)
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+ ; next Tweak4 -> TW slot 3
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+ ; next Tweak5: shift and select the reduction value (applied after round 8)
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+ ; next Tweak5 -> TW slot 4
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+ ; next Tweak6
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+ ; next Tweak6 -> TW slot 5; start Tweak7
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+ ; finish next Tweak7; low half -> TW slot 6
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+ ; high half of Tweak7 -> TW slot 6; start Tweak8
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+ ; finish next Tweak8 in registers only: TW slot 7 may still be the %%TW8
+ ; operand read by the final tweak XOR below, so the store is deferred
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; TW slot 7 <- twtemph:twtempl, now that %%TW8 has been consumed
+ ; (emitted even when %%last_eight != 0)
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_dec_sse:function
+XTS_AES_128_dec_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;decrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; 32 bytes of pshufb control bytes; _steal_cipher does a 16-byte unaligned load at byte offset k (1..15)
+; into this table, which yields the shuffle control listed below for that k:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; table bytes 0..15
+dq 0x0706050403020100, 0x000e0d0c0b0a0908 ; table bytes 16..31
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080 ; 0x80 in every byte; _steal_cipher XORs it into a table entry to flip bit 7 of each control byte
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm
new file mode 100644
index 00000000..572c6195
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm
@@ -0,0 +1,1530 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; round keys of the "ECB" data key (k1) are stored on the stack, aligned to 16 Bytes
+; the tweak key (k2) is required only once, to encrypt the initial tweak, so its round keys are not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; first 16*8 bytes of the frame: store 8 tweak values
+%define keys rsp + 16*8 ; next 16*11 bytes: store the 11 expanded AES-128 round keys of key1 (rounds 0..10)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; win64 only, 16*10 bytes: store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; tweaks + round keys + rbx; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; tweaks + round keys + xmm6:xmm15 + 3 GPRs; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87 ; presumably the value loaded into ghash_poly_8b for the tweak update (load is outside this section - confirm)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_avx(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters (same order as the prototype: k2, k1, TW_initial, N, pt, ct)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi ; k2, key used for tweaking
+ %xdefine ptr_key1 rsi ; k1, key used for "ECB" encryption
+ %xdefine T_val rdx ; TW_initial
+ %xdefine N_val rcx ; N, sector size in bytes
+ %xdefine ptr_plaintext r8 ; pt
+ %xdefine ptr_ciphertext r9 ; ct
+%else
+ %xdefine ptr_key2 rcx ; k2, key used for tweaking
+ %xdefine ptr_key1 rdx ; k1, key used for "ECB" encryption
+ %xdefine T_val r8 ; TW_initial
+ %xdefine N_val r9 ; N, sector size in bytes
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] (presumably where pt is read from; the load is not in this section)
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] (presumably where ct is read from; the load is not in this section)
+%endif
+
+; arguments for temp parameters (some share a register with a key pointer defined above)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi ; same register as ptr_key2
+ %define target_ptr_val rsi ; same register as ptr_key1
+ %define ghash_poly_8b r10 ; reduction value XORed into the tweak on carry
+ %define ghash_poly_8b_temp r11 ; zero or ghash_poly_8b, selected by cmovc
+%else
+ %define tmp1 rcx ; same register as ptr_key2
+ %define target_ptr_val rdx ; same register as ptr_key1
+ %define ghash_poly_8b rdi ; reduction value XORed into the tweak on carry; rdi has a save slot in _gpr
+ %define ghash_poly_8b_temp rsi ; zero or ghash_poly_8b, selected by cmovc; rsi has a save slot in _gpr
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation (low 64 bits)
+%define twtemph rbx ; high 64 bits of the tweak; rbx has a save slot in _gpr
+
+
+; key_expansion_128 xraw_key, xtmp, xround_key: produce the key for the next round
+; xraw_key is the output of the vaeskeygenassist instruction; it is overwritten with its dword 3 broadcast
+; xround_key is the current round key before this macro and the next round key after it
+; xtmp is scratch; both shuffles copy its dword 0 through, so it must be zero on entry (clearing it is the caller's job - not visible here)
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b ; broadcast dword 3 of the keygenassist result
+ vshufps %%xtmp, %%xtmp, %%xround_key, 00010000b ; xtmp = {xtmp[0], xtmp[0], key[1], key[0]}; VEX-encoded like the rest of the macro
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xtmp, %%xround_key, 10001100b ; xtmp = {xtmp[0], xtmp[3], key[0], key[2]} taken from the updated key
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key ; fold in the broadcast keygenassist word
+%endmacro
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+; key2 round keys are consumed immediately by the tweak encryption; key1 round keys 0..10 are written to ptr_expanded_keys
+%macro encrypt_T 8
+%define %%xkey2 %1 ; xmm: running round key of key2 (tweak key)
+%define %%xstate_tweak %2 ; xmm: in = initial tweak, out = encrypted tweak
+%define %%xkey1 %3 ; xmm: running round key of key1
+%define %%xraw_key %4 ; xmm: scratch for the vaeskeygenassist result
+%define %%xtmp %5 ; xmm: scratch passed to key_expansion_128
+%define %%ptr_key2 %6 ; address of key2, 16 bytes read with an unaligned load
+%define %%ptr_key1 %7 ; address of key1, 16 bytes read with an unaligned load
+%define %%ptr_expanded_keys %8 ; destination of the 11 key1 round keys; written with vmovdqa, so 16-byte aligned
+
+
+ vmovdqu %%xkey2, [%%ptr_key2] ; round key 0 of key2 is the key itself
+ vmovdqu %%xkey1, [%%ptr_key1] ; round key 0 of key1 is the key itself
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round key 0 of key1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round key 1 of key1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round key 2 of key1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1 ; store round key 3 of key1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1 ; store round key 4 of key1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1 ; store round key 5 of key1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1 ; store round key 6 of key1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1 ; store round key 7 of key1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1 ; store round key 8 of key1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1 ; store round key 9 of key1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1 ; store round key 10 (last) of key1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values: tweak 1 is read from [TW]; each further tweak is the previous one doubled and stored to the next TW slot
+; load initial plaintext values: num_initial_blocks (1..7) blocks are read from ptr_plaintext with unaligned loads
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced by this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0] ; tweak 1 is the value already stored at TW slot 0
+ mov twtempl, [TW+8*0] ; low 64 bits of the running tweak
+ mov twtemph, [TW+8*1] ; high 64 bits of the running tweak
+ vmovdqu %%ST1, [ptr_plaintext+16*0] ; input block 1
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; temp = 0, used when no reduction is needed
+ shl twtempl, 1 ; double the low half, bit 63 goes to CF
+ adc twtemph, twtemph ; double the high half plus carry-in, bit 127 goes to CF
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit was shifted out: select the reduction value
+ xor twtempl, ghash_poly_8b_temp ; apply the reduction (or XOR with 0)
+ mov [TW+8*2], twtempl ; store tweak 2 in TW slot 1
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1] ; reload tweak 2 as a 128-bit value
+ vmovdqu %%ST2, [ptr_plaintext+16*1] ; input block 2
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl ; store tweak 3 in TW slot 2
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2] ; input block 3
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl ; store tweak 4 in TW slot 3
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3] ; input block 4
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl ; store tweak 5 in TW slot 4
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4] ; input block 5
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl ; store tweak 6 in TW slot 5
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5] ; input block 6
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl ; store tweak 7 in TW slot 6
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6] ; input block 7
+%endif
+ ; on exit twtempl:twtemph hold the tweak of the last block loaded above
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; encrypt_by_eight: encrypt eight blocks in parallel (tweak xor, ARK, 9 x vaesenc, vaesenclast, tweak xor)
+; when last_eight is 0, the next eight tweak values are generated in twtempl/twtemph between the AES rounds
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (the call sites in this file pass the memory operand [TW+16*7])
+%define %%T0 %17 ; Temp register, holds the current round key
+%define %%last_eight %18 ; 0: also generate the next 8 tweaks, 1: skip tweak generation
+
+ ; xor each input block with its tweak before encryption
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK: xor round key 0 into every state
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 1: clear the reduction term
+ shl twtempl, 1 ; shift the 128-bit tweak left by one bit: low half
+ adc twtemph, twtemph ; high half, taking the carry out of the low half
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit was shifted out of the high half: select ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp ; fold the reduction term (0 or ghash_poly_8b) into the low half
+ mov [TW + 8*0], twtempl ; next tweak 1, low qword
+ mov [TW + 8*1], twtemph ; next tweak 1, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 2: clear the reduction term
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1 ; next tweak 2: same shift / conditional-xor sequence
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next tweak 2, low qword
+ mov [TW + 8*3], twtemph ; next tweak 2, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 3 begins
+ shl twtempl, 1 ; the carry from this shl is consumed by the adc after round 4
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph ; relies on the carry flag left by the shl before round 4
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next tweak 3, low qword
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; next tweak 3, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 4 begins
+ shl twtempl, 1
+ adc twtemph, twtemph ; the carry from this adc is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; relies on the carry flag left by the adc before round 6
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next tweak 4, low qword
+ mov [TW + 8*7], twtemph ; next tweak 4, high qword
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 5 begins
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next tweak 5, low qword
+ mov [TW + 8*9], twtemph ; next tweak 5, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 6 begins
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next tweak 6, low qword
+ mov [TW + 8*11], twtemph ; next tweak 6, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 7 begins
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next tweak 7, low qword
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; next tweak 7, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 8 begins
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; next tweak 8 is not stored here: the call sites pass [TW+16*7] (this same slot) as %%TW8 and it is
+; still read by the final tweak xor below, so the store to [TW + 8*14] / [TW + 8*15] follows that xor
+%endif
+ ; round 10 (last round, vaesenclast)
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor each encrypted state with the same tweak that was applied before encryption
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl ; tweak slot 8 may be overwritten now; done unconditionally, also when last_eight is 1
+ mov [TW + 8*15], twtemph
+ ; reload tweaks 1-7 from the TW buffer (also unconditional); tweak 8 is not loaded into a register
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+; XTS_AES_128_enc_avx entry: save registers, run encrypt_T, then dispatch on the whole-block count of N_val
+global XTS_AES_128_enc_avx:function
+XTS_AES_128_enc_avx:
+
+ sub rsp, VARIABLE_OFFSET ; reserve the local frame; released again right before ret at _ret_
+
+ mov [_gpr + 8*0], rbx ; saved on every target, restored at _ret_
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi ; win64 only: rdi, rsi and xmm6-xmm15 are saved here and restored at _ret_
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+ ; NOTE(review): encrypt_T is defined outside this section; presumably it encrypts the initial tweak with key2 and expands key1 into keys - confirm
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer, read from the caller's stack (rsp + VARIABLE_OFFSET is rsp at entry)
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer, read from the caller's stack
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are handled by _last_eight without tweak generation
+ jl _less_than_128_bytes ; fewer than 8 whole blocks in total
+
+ add target_ptr_val, ptr_ciphertext ; target_ptr_val = ciphertext address where the final eight whole blocks start
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = (whole-block count mod 8) * 16
+ jz _initial_num_blocks_is_0
+ ; 1-7 leading blocks are encrypted first so that the remainder is a multiple of eight blocks
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+ ; no match above: tmp1 == (7 << 4), fall through to the 7-block case
+_initial_num_blocks_is_7: ; whole-block count mod 8 is 7: encrypt 7 leading blocks before the 8-block code
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7 ; step past the 7 leading plaintext blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store the 7 ciphertext blocks left in xmm1-xmm7 by encrypt_initial
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val ; already at the final eight whole blocks?
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6: ; whole-block count mod 8 is 6: encrypt 6 leading blocks before the 8-block code
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6 ; step past the 6 leading plaintext blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store the 6 ciphertext blocks left in xmm1-xmm6 by encrypt_initial
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val ; already at the final eight whole blocks?
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5: ; whole-block count mod 8 is 5: encrypt 5 leading blocks before the 8-block code
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5 ; step past the 5 leading plaintext blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store the 5 ciphertext blocks left in xmm1-xmm5 by encrypt_initial
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val ; already at the final eight whole blocks?
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4: ; whole-block count mod 8 is 4: encrypt 4 leading blocks before the 8-block code
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4 ; step past the 4 leading plaintext blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store the 4 ciphertext blocks left in xmm1-xmm4 by encrypt_initial
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val ; already at the final eight whole blocks?
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3: ; whole-block count mod 8 is 3: encrypt 3 leading blocks before the 8-block code
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3 ; step past the 3 leading plaintext blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store the 3 ciphertext blocks left in xmm1-xmm3 by encrypt_initial
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val ; already at the final eight whole blocks?
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2: ; whole-block count mod 8 is 2: encrypt 2 leading blocks before the 8-block code
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2 ; step past the 2 leading plaintext blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store the 2 ciphertext blocks left in xmm1-xmm2 by encrypt_initial
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val ; already at the final eight whole blocks?
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1: ; whole-block count mod 8 is 1: encrypt 1 leading block before the 8-block code
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1 ; step past the leading plaintext block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store the single ciphertext block left in xmm1 by encrypt_initial
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val ; already at the final eight whole blocks?
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0: ; whole-block count is a multiple of 8: no leading blocks, only derive tweaks 1-8
+ mov twtempl, [TW+8*0] ; tweak 1 is read from TW slot 0 (written before this point, presumably by encrypt_T - confirm)
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0] ; xmm9 = tweak 1
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 2: clear the reduction term
+ shl twtempl, 1 ; shift the 128-bit tweak left by one bit: low half
+ adc twtemph, twtemph ; high half, taking the carry out of the low half
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit was shifted out of the high half: select ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp ; fold the reduction term (0 or ghash_poly_8b) into the low half
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1] ; xmm10 = tweak 2
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 3: same sequence
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2] ; xmm11 = tweak 3
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 4
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3] ; xmm12 = tweak 4
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 5
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4] ; xmm13 = tweak 5
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 6
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5] ; xmm14 = tweak 6
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 7
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6] ; xmm15 = tweak 7
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 8
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ; tweak 8 is left in memory at [TW+16*7]; _main_loop and _last_eight pass that slot to encrypt_by_eight as a memory operand
+
+ cmp ptr_ciphertext, target_ptr_val ; exactly eight whole blocks in total?
+ je _last_eight ; otherwise fall through into _main_loop
+_main_loop: ; one iteration encrypts 8 blocks (128 bytes) and prepares the tweaks for the next 8
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+ ; tweaks 1-7 are in xmm9-xmm15, tweak 8 is the memory operand [TW+16*7]; last argument 0 = also generate the next 8 tweaks
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val ; loop until only the final eight whole blocks remain
+ jne _main_loop
+
+_last_eight: ; final eight whole blocks; the pointers are not advanced, later code uses fixed offsets from them
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext blocks 1-7 only
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ ; block 8 (xmm8) is not stored yet: _done stores it, after _steal_cipher has replaced it when a partial tail exists
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done ; no partial tail: fall through to _steal_cipher otherwise
+_steal_cipher: ; entered with N_val = tail length (1-15) and xmm8 = ciphertext of the last whole block
+ ; start cipher stealing
+
+ ; generate next Tweak value (same shift / conditional-xor sequence as elsewhere) and keep it in [TW]
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8 ; keep an unshifted copy of the last whole ciphertext block for the blend below
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table] ; the tweak is already saved in [TW]; twtempl is reused as a table pointer
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0 ; the first N_val bytes of the block move to the top, the rest become zero
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move; 16 bytes ending at the end of the input, the top N_val bytes are the partial tail
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 ; writes the partial output tail; the zero bytes below it are overwritten by the store at _done
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1] ; flip the high bit of every mask byte
+ vpshufb xmm3, xmm0 ; the tail bytes move to the bottom N_val bytes, the rest become zero
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ; bytes whose mask byte in xmm0 has the high bit set are taken from xmm2, the others stay from xmm3
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value; execution then falls through to _done, which stores xmm8
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value: xmm8 goes to the last whole block position (unchanged block, or the re-encrypted one when arriving from _steal_cipher)
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_: ; common exit: restore the registers saved at entry and release the local frame
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1] ; win64 only: mirrors the saves made at function entry
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET ; undo the sub rsp at function entry
+
+ ret
+
+
+
+
+
+_less_than_128_bytes: ; fewer than 8 whole blocks: handle 1-7 blocks without the 8-block code
+ cmp N_val, 16
+ jb _ret_ ; fewer than 16 bytes: return without writing any output
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = whole-block count * 16 (the count is 1-7 on this path)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1 ; no match at all: fall through to _num_blocks_is_7
+
+_num_blocks_is_7: ; short input with 7 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1 ; bias the pointer so the fixed offsets used by _steal_cipher/_done (16*7, 112 + N_val) hit the last whole block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext blocks 1-6; the last whole block is handed to the tail code in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; same bias as applied to ptr_plaintext
+ vmovdqa xmm8, xmm7 ; xmm8 = last whole ciphertext block, as _steal_cipher/_done expect
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6: ; short input with 6 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2 ; bias the pointer so the fixed offsets used by _steal_cipher/_done (16*7, 112 + N_val) hit the last whole block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext blocks 1-5; the last whole block is handed to the tail code in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; same bias as applied to ptr_plaintext
+ vmovdqa xmm8, xmm6 ; xmm8 = last whole ciphertext block, as _steal_cipher/_done expect
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5: ; short input with 5 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3 ; bias the pointer so the fixed offsets used by _steal_cipher/_done (16*7, 112 + N_val) hit the last whole block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext blocks 1-4; the last whole block is handed to the tail code in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; same bias as applied to ptr_plaintext
+ vmovdqa xmm8, xmm5 ; xmm8 = last whole ciphertext block, as _steal_cipher/_done expect
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4: ; short input with 4 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4 ; bias the pointer so the fixed offsets used by _steal_cipher/_done (16*7, 112 + N_val) hit the last whole block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext blocks 1-3; the last whole block is handed to the tail code in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; same bias as applied to ptr_plaintext
+ vmovdqa xmm8, xmm4 ; xmm8 = last whole ciphertext block, as _steal_cipher/_done expect
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3: ; short input with 3 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5 ; bias the pointer so the fixed offsets used by _steal_cipher/_done (16*7, 112 + N_val) hit the last whole block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext blocks 1-2; the last whole block is handed to the tail code in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; same bias as applied to ptr_plaintext
+ vmovdqa xmm8, xmm3 ; xmm8 = last whole ciphertext block, as _steal_cipher/_done expect
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2: ; short input with 2 whole blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6 ; bias the pointer so the fixed offsets used by _steal_cipher/_done (16*7, 112 + N_val) hit the last whole block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext block 1; the last whole block is handed to the tail code in xmm8
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; same bias as applied to ptr_plaintext
+ vmovdqa xmm8, xmm2 ; xmm8 = last whole ciphertext block, as _steal_cipher/_done expect
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1: ; short input with 1 whole block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7 ; bias the pointer so the fixed offsets used by _steal_cipher/_done (16*7, 112 + N_val) hit the only whole block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the only block is handed to the tail code in xmm8 and written at _done
+
+ sub ptr_ciphertext, 16*7 ; same bias as applied to ptr_plaintext
+ vmovdqa xmm8, xmm1 ; xmm8 = the only ciphertext block, as _steal_cipher/_done expect
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; shift constants for the vpshufb instruction; _steal_cipher reads 16 bytes at offset N_val (left shift) or 16-N_val (right shift, after xor with mask1)
+; reading 16 bytes at different byte offsets into the table results in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; table bytes 0-15: 0x00, then 0x81-0x8f (high bit set)
+dq 0x0706050403020100, 0x000e0d0c0b0a0908 ; table bytes 16-31: 0x00-0x0e, then 0x00
+
+mask1: ; 0x80 in every byte; _steal_cipher xors it into a table mask to flip each byte's high bit
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm
new file mode 100644
index 00000000..10812333
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm
@@ -0,0 +1,1505 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key argument (k1, used for the data blocks) is copied to the stack, aligned to 16 Bytes
+; first key argument (k2, used to encrypt the tweak) is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+default rel
+
+%define TW rsp ; store 8 tweak values (8 slots of 16 bytes at the bottom of the frame)
+%define keys rsp + 16*8 ; store 11 expanded keys (directly above the 8 tweak slots)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15 (directly above tweaks + keys)
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; tweaks + keys + rbx; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rbx, rdi, rsi (slots 0, 1, 2 in that order)
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; tweaks + keys + xmm6:xmm15 + 3 GPRs; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87 ; XORed into the low tweak qword when doubling the tweak carries out of bit 127
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; registers that carry the input arguments (elf64 branch first, any other output format second)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; loaded by the function from [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; loaded by the function from [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; scratch registers; tmp1 and target_ptr_val are the same registers as ptr_key2 and ptr_key1 in both branches
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation (low 64 bits of the running tweak)
+%define twtemph rbx ; high 64 bits of the running tweak
+
+
+; macro to encrypt the tweak value in %2 with the 11 round keys read from %6 (ARK + 10 AES rounds)
+; interleaved with that, the 11 round keys read from %7 are copied to the buffer %8; the result is also stored to [TW]
+%macro encrypt_T 8
+%define %%xkey2 %1 ; scratch: round key currently applied to the tweak
+%define %%xstate_tweak %2 ; in: tweak to encrypt, out: encrypted tweak
+%define %%xkey1 %3 ; scratch: round key in transit to the stack copy
+%define %%xraw_key %4 ; not referenced inside this macro
+%define %%xtmp %5 ; not referenced inside this macro
+%define %%ptr_key2 %6 ; address of the round keys applied to the tweak (read unaligned)
+%define %%ptr_key1 %7 ; address of the round keys to copy (read unaligned)
+%define %%ptr_expanded_keys %8 ; destination of the copy (written with aligned stores)
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values: tweak 1 is read from [TW], tweaks 2..%16 are derived by repeated doubling and stored to [TW+16*1..]
+; load initial plaintext values: %16 blocks are read (unaligned) from ptr_plaintext into the state registers
+%macro initialize 16
+; on exit twtempl/twtemph hold the last tweak generated here (tweak 1 itself when %16 is 1)
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced inside this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+; %%num_initial_blocks selects how many of the 7 state/tweak pairs are filled in
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0] ; low qword of tweak 1
+ mov twtemph, [TW+8*1] ; high qword of tweak 1
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reduction term defaults to 0
+ shl twtempl, 1 ; double the low qword, CF = bit shifted out
+ adc twtemph, twtemph ; double the high qword and add that bit, CF = old bit 127
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; old bit 127 set: take the constant held in ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp ; fold the reduction term into the low qword
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above, producing tweak 3
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 4
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 5
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 6
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 7
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES: each used state is xored with its tweak, run through ARK + 10 rounds with the keys at [keys], and xored with its tweak again
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted (selected by %17)
+; next 8 Tweak values are generated between the rounds and written to [TW], only when %18 is 0
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced inside this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+; when %%lt128 is 0 the tweak chain continues from the value already in twtempl/twtemph
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+; the scalar tweak doubling below is split across the AES rounds; a step may leave CF for the piece after the next round
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF is consumed by the cmovc after round 1
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses CF from the adc issued before round 1
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high qword of next Tweak2
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; CF is consumed by the adc after round 5
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph ; uses CF from the shl issued before round 5
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values (the first 7 of the 8 just generated; the 8th stays at [TW + 16*7])
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel: xor with tweaks, ARK + 10 rounds with the keys at [keys], xor with tweaks again
+; generate next 8 tweak values between the rounds (skipped when %18 is nonzero) and store them to [TW]
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (both invocations in this file pass the memory operand [TW+16*7])
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18 ; 0: generate the next 8 tweaks, nonzero: skip the generation
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reduction term defaults to 0
+ shl twtempl, 1 ; double the running tweak, continuing from the value already in twtempl/twtemph
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; carry out of bit 127: take the constant held in ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next tweak 1 (the %%TW1 register still holds the current one)
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next tweak 2
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; CF is consumed by the adc after round 4
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph ; uses CF from the shl issued before round 4
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next tweak 3, low qword
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; next tweak 3, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses CF from the adc issued before round 6
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next tweak 4
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next tweak 5
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+; the remaining tweak steps run back to back here, with no AES round between them
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next tweak 6
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next tweak 7, low qword
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; next tweak 7, high qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp ; twtempl/twtemph now hold next tweak 8
+; mov [TW + 8*14], twtempl ; disabled here: the store is issued after the final xor with %%TW8 (see below),
+; mov [TW + 8*15], twtemph ; since with %%TW8 = [TW+16*7] an earlier store would overwrite the tweak still to be read
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+; everything from here to %endmacro is outside the %if blocks, so it is emitted for either value of %%last_eight
+ mov [TW + 8*14], twtempl ; store twtempl/twtemph into the tweak-8 slot
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_enc_expanded_key_avx:function
+XTS_AES_128_enc_expanded_key_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+; 32-byte window of shuffle-control bytes: an unaligned 16-byte load at offset N (1..15) yields the row for N listed below
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown (row as loaded: shift left by 16-N bytes; row XORed with mask1: shift right by N bytes):
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+; 0x80 in every byte: XORing it into a row toggles bit 7 of each control byte, i.e. swaps which result bytes vpshufb zeroes and which it copies
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm
new file mode 100644
index 00000000..07cf9f67
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm
@@ -0,0 +1,1504 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; k1 round keys (used for the data blocks) are copied to the stack, aligned to 16 Bytes
+; k2 round keys (used for the tweak) are required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values (8 x 16 bytes, rsp+0 .. rsp+127)
+%define keys rsp + 16*8 ; store 11 expanded keys (11 x 16 bytes, rsp+128 .. rsp+303)
+; the register save areas below start right after the keys, at rsp + 16*19
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15 (10 x 16 bytes)
+%endif
+; _gpr is the last area of the frame; VARIABLE_OFFSET is the total frame size subtracted from rsp on entry
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 (rsp is 8 mod 16 right after the call, so this makes rsp 16-byte aligned for movdqa)
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx (follows the 10 saved xmm registers)
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 (same alignment reason as above)
+%endif
+; value loaded into ghash_poly_8b and XORed into the low qword of a tweak when its 128-bit left shift carries a bit out
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters (register names follow the argument order k2, k1, TW_initial, N, pt, ct)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; not a register argument: loaded from [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; not a register argument: loaded from [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; temporary registers; tmp1 and target_ptr_val reuse the ptr_key2 / ptr_key1 registers, so they may only be written once the key pointers are no longer needed
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi ; saved to / restored from the _gpr area on win64
+ %define ghash_poly_8b_temp rsi ; saved to / restored from the _gpr area on win64
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation (low qword of the current tweak)
+%define twtemph rbx ; high qword of the current tweak; rbx is saved to / restored from the _gpr area
+
+
+; macro to encrypt the tweak value with the round keys at %6 while copying the round keys at %7 into the buffer at %8
+; the encrypted tweak is left in %2 and also stored at [TW]; key loads use movdqu (unaligned), stores into %8 and [TW] use movdqa (aligned)
+%macro encrypt_T 8
+%define %%xkey2 %1 ; xmm scratch: current round key loaded from %%ptr_key2
+%define %%xstate_tweak %2 ; xmm: initial tweak on entry, encrypted in place
+%define %%xkey1 %3 ; xmm scratch: round key in transit from %%ptr_key1 to %%ptr_expanded_keys
+%define %%xraw_key %4 ; not referenced in this macro body
+%define %%xtmp %5 ; not referenced in this macro body
+%define %%ptr_key2 %6 ; pointer to the 11 round keys used to encrypt the tweak
+%define %%ptr_key1 %7 ; pointer to the 11 round keys that are copied
+%define %%ptr_expanded_keys %8 ; destination of the copy; written with movdqa, so it must be 16-byte aligned
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate the tweak values for the first %16 blocks (1..7), starting from the encrypted tweak stored at [TW]
+; load the matching plaintext blocks from ptr_plaintext (ptr_plaintext itself is not advanced here)
+%macro initialize 16
+; each further tweak is the previous one shifted left by one bit as a 128-bit value (twtemph:twtempl), with ghash_poly_8b XORed into the low qword when a bit is carried out
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro body)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; correction term defaults to 0
+ shl twtempl, 1 ; low qword: top bit goes to CF
+ adc twtemph, twtemph ; high qword: shift left, pull in CF, top bit goes to CF
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit was shifted out of the 128-bit value: select the correction
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+; on exit twtemph:twtempl still hold the last tweak generated above
+
+%endmacro
+
+
+; encrypt initial blocks of AES-128 (ARK, 9 aesenc rounds, aesenclast; round keys are read from [keys])
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted, each XORed with its tweak before and after the AES rounds
+; next 8 Tweak values are generated only when %18 is 0; their scalar computation is interleaved with the AES rounds
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro body)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register (holds the current round key)
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes: when nonzero, no next tweaks are generated and TW1..TW7 are not reloaded
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF is consumed by the cmovc after round 1; the movdqa/aesenc in between do not modify the flags
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high qword of Tweak2, stored before twtemph is shifted again
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; CF is consumed by the adc after round 5
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values (the eighth stays in [TW + 16*7] and in twtemph:twtempl)
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel with AES-128 (round keys read from [keys]), XORing each block with its tweak before and after
+; generate next 8 tweak values, interleaved with the AES rounds, unless %18 is nonzero
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (only used as a pxor source; NOTE(review): callers presumably pass the memory slot [TW+16*7] - confirm)
+%define %%T0 %17 ; Temp register (holds the current round key)
+%define %%last_eight %18 ; nonzero: skip generating the next tweak values
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; CF is consumed by the adc after round 4; the movdqa/aesenc in between do not modify the flags
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+ ; eighth tweak is stored only here, after the pxor with %%TW8 (presumably because %%TW8 may be this same memory slot - confirm); done even when %%last_eight is nonzero
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values (also executed when %%last_eight is nonzero)
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_enc_expanded_key_sse:function
+XTS_AES_128_enc_expanded_key_sse:
+; Entry: reserve the stack frame, save registers, run encrypt_T on the initial tweak, then dispatch on the block count.
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx ; rbx is restored at _ret_
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi ; win64 only: rdi, rsi and xmm6-xmm15 are saved here and restored at _ret_
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+; target_ptr_val becomes the output address at which only the final 8 full blocks remain
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 8 blocks (128 bytes) will not be stitched with Tweak calculations
+ jl _less_than_128_bytes ; fewer than 8 full blocks in total
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * (number of full blocks mod 8)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1 ; no match at all means 7 blocks: fall through
+
+_initial_num_blocks_is_7: ; reached by fall-through when none of the compares above matched
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7 ; advance the input pointer past the 7 leading blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val ; equal: only the final 8 full blocks remain
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6: ; same pattern as above with 6 leading blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5: ; 5 leading blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4: ; 4 leading blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3: ; 3 leading blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2: ; 2 leading blocks
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1: ; 1 leading block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0: ; full-block count is a multiple of 8: no leading group, only prepare 8 tweaks
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0] ; tweak 1, as already stored at [TW]
+; each step below shifts the 128-bit tweak (twtemph:twtempl) left by one bit and, on carry-out, xors ghash_poly_8b into the low qword
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1] ; tweak 2
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2] ; tweak 3
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3] ; tweak 4
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4] ; tweak 5
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5] ; tweak 6
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6] ; tweak 7
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ; tweak 8 stays in memory at [TW+16*7]; it is passed to encrypt_by_eight as a memory operand
+
+ cmp ptr_ciphertext, target_ptr_val ; equal: the input holds exactly 8 full blocks
+ je _last_eight
+_main_loop: ; one iteration handles 8 blocks (128 bytes); last argument 0 = not the final group
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+; tweaks 1-7 are in xmm9-xmm15, tweak 8 is the memory operand [TW+16*7], xmm0 is scratch
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val ; loop until only the final 8 full blocks remain
+ jne _main_loop
+
+_last_eight: ; final 8 full blocks; the pointers are not advanced here, so later code uses offsets +112 / +16*7
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+; block 8 stays in xmm8: it is stored at _done, directly or after ciphertext stealing
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done ; no trailing partial block
+_steal_cipher:
+ ; start cipher stealing: here N_val is the size (1..15) of the trailing partial block and xmm8 the ciphertext of the last full block
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8 ; keep an unshifted copy for the blend below
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table] ; twtempl is free again: the new tweak was just stored at [TW]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8 ; its top N_val bytes form the partial output block; the bytes below offset 128 are rewritten at _done
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1] ; flip bit 7 of every control byte
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value (callers with fewer than 8 blocks bias ptr_ciphertext so that +16*7 is their last full block)
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+; common exit: restore the registers saved at entry and release the stack frame
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes: ; 1..7 full blocks; N_val below 16 returns without writing any output
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * number of full blocks
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7: ; fall-through case: none of the compares above matched
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1 ; bias by (8-7) blocks so the fixed +112 / +16*7 offsets in _steal_cipher/_done hit block 7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; same bias for the output pointer
+ movdqa xmm8, xmm7 ; _steal_cipher/_done expect the last full block in xmm8
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6: ; same pattern: bias by (8-6) blocks, last block xmm6 -> xmm8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5: ; bias by (8-5) blocks, last block xmm5 -> xmm8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4: ; bias by (8-4) blocks, last block xmm4 -> xmm8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3: ; bias by (8-3) blocks, last block xmm3 -> xmm8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2: ; bias by (8-2) blocks, last block xmm2 -> xmm8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1: ; bias by (8-1) blocks, the only block xmm1 -> xmm8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing to store here: the single block is written at _done
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; the 32 bytes are read unaligned at offset N_val and at offset 16-N_val
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+; mask1: xor-ed into a loaded control vector to flip bit 7 of every byte
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm
new file mode 100644
index 00000000..bcdd3a75
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm
@@ -0,0 +1,1529 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the expanded data key (k1, the second argument) is stored on the stack, aligned to 16 bytes
+; the tweak key (k2, the first argument) is needed only once, so its round keys are not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded round keys (AES-128: round keys 0..10)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+; frame layout: 16*8 bytes of tweaks, 16*11 of round keys, then 16*10 of xmm saves (win64 only), then the GPR saves
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 (presumably so rsp is 16-byte aligned for movdqa after the caller's return-address push)
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87 ; constant xor-ed into the low tweak qword on carry-out (presumably loaded into ghash_poly_8b by the function)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_sse(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; register aliases for the six input parameters (elf64: all in registers; otherwise the last two get r10/r11)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; register aliases for temporaries; tmp1 and target_ptr_val reuse the registers of ptr_key2 and ptr_key1, so the key pointers are dead once these are written
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx ; high qword of the running tweak (twtempl holds the low qword)
+
+
+; produce the key for the next round (AES-128 key schedule step)
+; raw_key is the output of aeskeygenassist instruction; it is overwritten with a broadcast of its highest dword
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key; xtmp is scratch whose low dword is assumed zero on entry and is left unchanged
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+; on exit the encrypted tweak is at [TW] and the 11 round keys of key1 are at [ptr_expanded_keys + 16*0 .. 16*10]
+%macro encrypt_T 8
+%define %%xkey2 %1 ; running round key of key2 (tweak key); never stored
+%define %%xstate_tweak %2 ; in: tweak to encrypt (must be loaded by the caller); out: encrypted tweak
+%define %%xkey1 %3 ; running round key of key1; each round key is stored to the stack
+%define %%xraw_key %4 ; scratch for the aeskeygenassist result
+%define %%xtmp %5 ; scratch for key_expansion_128
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*3], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*4], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*5], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*6], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*7], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*8], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*9], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*10], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values: tweak 1 is read from [TW]; tweaks 2..num_initial_blocks are derived from it and stored at [TW+16*1 ..]
+; load initial plaintext values: blocks 1..num_initial_blocks from ptr_plaintext into ST1.. (ptr_plaintext is not advanced)
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced by this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values; twtemph:twtempl carries the running tweak and holds the last generated one on exit
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; on carry-out, fold ghash_poly_8b into the low qword
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel: xor the tweaks in, apply round keys 0..10 from [keys] (AES-128), xor the tweaks in again
+; when %%last_eight is 0 the next 8 tweak values are generated between the AES rounds and stored to [TW]
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (the call sites at _main_loop and _last_eight pass the memory operand [TW+16*7])
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18 ; 0: also generate the next 8 tweaks; nonzero: skip the tweak generation
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK (xor with round key 0)
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 1: clear the reduction term
+ shl twtempl, 1 ; double the 128-bit value twtemph:twtempl, low half first
+ adc twtemph, twtemph ; high half, taking in the bit shifted out of the low half
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit left the high half: reduction term = ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp ; apply the reduction term to the low half
+ mov [TW + 8*0], twtempl ; next tweak 1 stored at [TW + 16*0]
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 2: clear the reduction term
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp ; next tweak 2 is now in twtemph:twtempl
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next tweak 2 stored at [TW + 16*1]
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 3: clear the reduction term
+ shl twtempl, 1 ; CF from this shift is consumed by the adc after round 4; nothing in between may write flags
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph ; continues the doubling started before round 4
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; low half of next tweak 3 stored at [TW + 16*2]
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; high half of next tweak 3
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 4: clear the reduction term
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF from this adc is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses CF left by the adc before round 6
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next tweak 4 stored at [TW + 16*3]
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 5: clear the reduction term
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next tweak 5 stored at [TW + 16*4]
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 6: clear the reduction term
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp ; next tweak 6 is now in twtemph:twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next tweak 6 stored at [TW + 16*5]
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 7: clear the reduction term
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; low half of next tweak 7 stored at [TW + 16*6]
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; high half of next tweak 7
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; next tweak 8: clear the reduction term
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; next tweak 8 is not stored yet: [TW + 8*14] is the same address as [TW+16*7], which the call sites pass as %%TW8,
+; and %%TW8 is still read by the final tweak xor below; the store is done after that xor
+%endif
+ ; round 10 (last round)
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl ; not inside an %if: executed for every value of %%last_eight
+ mov [TW + 8*15], twtemph ; (when %%last_eight is nonzero the macro has not modified twtempl/twtemph)
+ ; load next Tweak values (tweak 8 is left in memory at [TW + 16*7])
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_enc_sse:function
+XTS_AES_128_enc_sse:
+ ; entry: save callee-saved state, prepare tweak/keys via encrypt_T, then dispatch on the number of whole 16-byte blocks
+ sub rsp, VARIABLE_OFFSET ; reserve the local frame (released by the add at _ret_)
+
+ mov [_gpr + 8*0], rbx ; rbx is restored at _ret_
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6 ; win64 only: xmm6-xmm15 are saved here and restored at _ret_
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+ ; NOTE(review): encrypt_T is defined outside this view; presumably it encrypts the tweak and expands the key into [keys] - confirm
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are handled at _last_eight without Tweak calculations
+ jl _less_than_128_bytes ; negative result: fewer than 8 whole blocks of input
+
+ add target_ptr_val, ptr_ciphertext ; ciphertext address at which exactly 8 whole blocks remain
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks mod 8)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1 ; no match above or here: tmp1 is (7 << 4), fall through to _initial_num_blocks_is_7
+_initial_num_blocks_is_7: ; 7 leading blocks are encrypted first; the whole blocks that remain are then a multiple of 8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7 ; NOTE(review): initialize is defined outside this view; presumably it has loaded these 7 blocks - confirm
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext (xmm1..xmm7)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val ; equal when exactly 8 whole blocks are left
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6: ; 6 leading blocks are encrypted first; the whole blocks that remain are then a multiple of 8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6 ; NOTE(review): initialize is defined outside this view; presumably it has loaded these 6 blocks - confirm
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext (xmm1..xmm6)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val ; equal when exactly 8 whole blocks are left
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5: ; 5 leading blocks are encrypted first; the whole blocks that remain are then a multiple of 8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5 ; NOTE(review): initialize is defined outside this view; presumably it has loaded these 5 blocks - confirm
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext (xmm1..xmm5)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val ; equal when exactly 8 whole blocks are left
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4: ; 4 leading blocks are encrypted first; the whole blocks that remain are then a multiple of 8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4 ; NOTE(review): initialize is defined outside this view; presumably it has loaded these 4 blocks - confirm
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext (xmm1..xmm4)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val ; equal when exactly 8 whole blocks are left
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3: ; 3 leading blocks are encrypted first; the whole blocks that remain are then a multiple of 8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3 ; NOTE(review): initialize is defined outside this view; presumably it has loaded these 3 blocks - confirm
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext (xmm1..xmm3)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val ; equal when exactly 8 whole blocks are left
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2: ; 2 leading blocks are encrypted first; the whole blocks that remain are then a multiple of 8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2 ; NOTE(review): initialize is defined outside this view; presumably it has loaded these 2 blocks - confirm
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext (xmm1, xmm2)
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val ; equal when exactly 8 whole blocks are left
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1: ; 1 leading block is encrypted first; the whole blocks that remain are then a multiple of 8
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1 ; NOTE(review): initialize is defined outside this view; presumably it has loaded this block - confirm
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext (xmm1)
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val ; equal when exactly 8 whole blocks are left
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0: ; whole-block count is a multiple of 8: only derive tweaks 2..8 from the tweak at [TW+16*0]
+ mov twtempl, [TW+8*0] ; low 64 bits of tweak 1
+ mov twtemph, [TW+8*1] ; high 64 bits of tweak 1
+ movdqa xmm9, [TW+16*0] ; xmm9 = tweak 1
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 2: clear the reduction term
+ shl twtempl, 1 ; double the 128-bit value twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit left the high half: reduction term = ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1] ; xmm10 = tweak 2
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 3, same doubling step
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2] ; xmm11 = tweak 3
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 4
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3] ; xmm12 = tweak 4
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 5
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4] ; xmm13 = tweak 5
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 6
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5] ; xmm14 = tweak 6
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 7
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6] ; xmm15 = tweak 7
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 8
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ; tweak 8 is not loaded into a register; it stays at [TW+16*7] and is passed to encrypt_by_eight as a memory operand
+
+ cmp ptr_ciphertext, target_ptr_val ; equal when the input has exactly 8 whole blocks
+ je _last_eight ; otherwise fall through into _main_loop
+_main_loop: ; each iteration encrypts 8 blocks (128 bytes) and generates the 8 tweaks for the next iteration
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+ ; tweaks 1..7 are in xmm9..xmm15, tweak 8 is the memory operand [TW+16*7]; last argument 0 = generate the next tweaks
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val ; loop until exactly 8 whole blocks are left
+ jne _main_loop ; on exit execution falls through into _last_eight
+
+_last_eight: ; final 8 whole blocks; ptr_plaintext and ptr_ciphertext are not advanced here
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+ ; last argument 1: no further tweaks are generated, twtemph:twtempl keep their value for _steal_cipher
+ ; store ciphertext (blocks 1..7 only)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ ; block 8 (xmm8) is stored at _done, after optional cipher stealing
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done ; no trailing partial block; otherwise fall through into _steal_cipher
+_steal_cipher: ; in: xmm8 = ciphertext of the last whole block, N_val = 1..15 trailing bytes, pointers biased so +112 is that block
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; double the 128-bit value twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit left the high half: reduction term = ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl ; the new tweak is kept at [TW]; twtempl is reused as a scratch pointer below
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8 ; keep an unshifted copy of the last whole ciphertext block
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val] ; 16-byte window of the table at byte offset N_val
+ pshufb xmm8, xmm0 ; the first N_val bytes of the block end up in the top N_val bytes, the rest is zeroed
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8 ; writes the partial output block; the bytes below +128 are overwritten again by the store at _done
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl] ; window of the table at byte offset 16-N_val
+ pxor xmm0, [mask1] ; flip bit 7 of every control byte
+ pshufb xmm3, xmm0 ; the N_val trailing plaintext bytes move to the low bytes, the rest is zeroed
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit (bytes whose mask byte has bit 7 set are taken from xmm2)
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value (execution then falls through into _done, which stores xmm8)
+ pxor xmm8, [TW]
+
+_done: ; xmm8 holds the block that belongs at [ptr_ciphertext+16*7]
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_: ; common exit, also reached directly from _less_than_128_bytes when N_val < 16
+ ; restore what the prologue saved
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET ; release the frame reserved at function entry
+
+ ret
+
+
+
+
+
+_less_than_128_bytes: ; reached when N_val has fewer than 8 whole blocks
+ cmp N_val, 16
+ jb _ret_ ; fewer than 16 bytes: return without storing any ciphertext
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks), which is 1..7 on this path
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1 ; no match above or here: 7 whole blocks, fall through to _num_blocks_is_7
+
+_num_blocks_is_7: ; input has 7 whole blocks (plus an optional partial block)
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1 ; bias so that the +112 offset used at _steal_cipher addresses block 7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext (all blocks except the last one)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; bias so that [ptr_ciphertext+16*7] at _done addresses block 7
+ movdqa xmm8, xmm7 ; _done and _steal_cipher expect the last whole block in xmm8
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6: ; input has 6 whole blocks (plus an optional partial block)
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2 ; bias so that the +112 offset used at _steal_cipher addresses block 6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext (all blocks except the last one)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; bias so that [ptr_ciphertext+16*7] at _done addresses block 6
+ movdqa xmm8, xmm6 ; _done and _steal_cipher expect the last whole block in xmm8
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5: ; input has 5 whole blocks (plus an optional partial block)
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3 ; bias so that the +112 offset used at _steal_cipher addresses block 5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext (all blocks except the last one)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; bias so that [ptr_ciphertext+16*7] at _done addresses block 5
+ movdqa xmm8, xmm5 ; _done and _steal_cipher expect the last whole block in xmm8
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4: ; input has 4 whole blocks (plus an optional partial block)
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4 ; bias so that the +112 offset used at _steal_cipher addresses block 4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext (all blocks except the last one)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; bias so that [ptr_ciphertext+16*7] at _done addresses block 4
+ movdqa xmm8, xmm4 ; _done and _steal_cipher expect the last whole block in xmm8
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3: ; input has 3 whole blocks (plus an optional partial block)
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5 ; bias so that the +112 offset used at _steal_cipher addresses block 3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext (all blocks except the last one)
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; bias so that [ptr_ciphertext+16*7] at _done addresses block 3
+ movdqa xmm8, xmm3 ; _done and _steal_cipher expect the last whole block in xmm8
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2: ; input has 2 whole blocks (plus an optional partial block)
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6 ; bias so that the +112 offset used at _steal_cipher addresses block 2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext (all blocks except the last one)
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; bias so that [ptr_ciphertext+16*7] at _done addresses block 2
+ movdqa xmm8, xmm2 ; _done and _steal_cipher expect the last whole block in xmm8
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1: ; input has 1 whole block (plus an optional partial block)
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7 ; bias so that the +112 offset used at _steal_cipher addresses block 1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the only whole block is written at _done
+
+ sub ptr_ciphertext, 16*7 ; bias so that [ptr_ciphertext+16*7] at _done addresses block 1
+ movdqa xmm8, xmm1 ; _done and _steal_cipher expect the last whole block in xmm8
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown (row k is the 16-byte window at byte offset k into the table below):
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; bytes 0..15: 0x00, 0x81 .. 0x8f
+dq 0x0706050403020100, 0x000e0d0c0b0a0908 ; bytes 16..31: 0x00 .. 0x0e, 0x00
+
+mask1: ; xored into a pshufb_shf_table window in _steal_cipher; flips bit 7 of each control byte
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm
new file mode 100644
index 00000000..767242d6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm
@@ -0,0 +1,1961 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; 16*8 bytes at [rsp]: scratch area for 8 tweak values
+%define keys rsp + 16*8 ; 16*15 bytes: the 15 expanded round keys written by encrypt_T
+; win64 only: the 16*10 bytes after the round keys hold the saved xmm6..xmm15
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+; _gpr follows the last 16-byte area; VARIABLE_OFFSET is the size of the whole stack frame
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; odd multiple of 8: rsp is 8 mod 16 at entry, so the sub leaves it 16-byte aligned for vmovdqa
+%else
+%define _gpr rsp + 16*33 ; store rbx, rdi, rsi (in that slot order)
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+; 0x87 is XORed into the low tweak qword whenever doubling the 128-bit tweak carries out of bit 127
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_avx(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; NOTE: in this decrypt routine ptr_plaintext is the INPUT (ct, 5th argument) and ptr_ciphertext the OUTPUT (pt, 6th argument)
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+; tmp1 and target_ptr_val alias ptr_key2 and ptr_key1, so they may only be written after encrypt_T has read both keys
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation (low 64 bits of the running tweak)
+%define twtemph rbx ; high 64 bits of the running tweak; rbx is saved to [_gpr] in the prologue
+
+
+; AES-256 key schedule, even step: %3 holds round key i-2 on entry and round key i on exit
+; %1 is the vaeskeygenassist result for round key i-1; its dword 3 (RotWord(SubWord(w)) xor rcon) is broadcast
+; %2 is scratch: dword 0 must be zero on entry and is zero again on exit; the two shuffles build the prefix XOR of the 4 words
+; vshufps (VEX) is used instead of legacy shufps so no SSE-encoded instruction is mixed into this AVX sequence
+; flip and flop are used alternately; flop is the odd step that uses SubWord only (dword 2 of the assist result)
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3 ; AES-256 key schedule, odd step: %3 = round key i-2 on entry, round key i on exit
+%define %%xraw_key %1 ; vaeskeygenassist result for round key i-1; only dword 2 (SubWord, no rotate/rcon) is used
+%define %%xtmp %2 ; scratch: dword 0 must be zero on entry and is zero again on exit
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xtmp, %%xround_key, 00010000b ; VEX form instead of legacy shufps: no SSE/AVX mixing
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+; encrypt_T: AES-256-encrypt the tweak with key2 while expanding key1 into the round keys used for decryption
+; key1 layout at %11: slot 14 = round key 0, slots 13..1 = vaesimc(round keys 1..13), slot 0 = round key 14
+%macro encrypt_T 11
+%define %%xkey2 %1 ; even-numbered round keys of key2 (0, 2, 4, ...)
+%define %%xkey2_2 %2 ; odd-numbered round keys of key2 (1, 3, 5, ...)
+%define %%xstate_tweak %3 ; in: initial tweak, out: encrypted tweak
+%define %%xkey1 %4 ; even-numbered round keys of key1
+%define %%xkey1_2 %5 ; odd-numbered round keys of key1
+%define %%xraw_key %6 ; vaeskeygenassist result
+%define %%xtmp %7 ; key-expansion scratch; dword 0 must be zero on entry
+%define %%xtmp2 %8 ; scratch for the vaesimc result
+%define %%ptr_key2 %9
+%define %%ptr_key1 %10
+%define %%ptr_expanded_keys %11 ; written with vmovdqa, so it must be 16-byte aligned
+
+; round keys 0 and 1 are the two raw 16-byte halves of each 256-bit key
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+; each step below: derive round key n of key2 and of key1, run tweak round n, store vaesimc(key1 round key n)
+; flop steps use dword 2 of the assist result (SubWord without rcon), so the rcon immediate on those lines is not used
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; last key1 round key is stored without vaesimc
+; the result goes to the fixed [TW] slot, not to a macro parameter
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; initialize: load the first num_initial_blocks (1..7) input blocks from ptr_plaintext into ST1.. and set up TW1..
+; tweak 1 is read from [TW+16*0]; every further tweak is the previous one doubled (128-bit shift left, xor ghash_poly_8b on carry)
+%macro initialize 16
+; on exit twtemph:twtempl hold the last tweak produced, and tweaks 1..n sit in [TW+16*0 .. TW+16*(n-1)]
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reduction value defaults to 0
+ shl twtempl, 1 ; CF = bit shifted out of the low qword
+ adc twtemph, twtemph ; high = 2*high + CF; CF = bit shifted out of the 128-bit value
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduce only when a bit was shifted out
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt_initial: despite the name, this DECRYPTS the initial blocks (vaesdec/vaesdeclast with the round keys at [keys])
+; 1, 2, 3, 4, 5, 6 or 7 blocks are processed: state ^= tweak, ARK, 13 vaesdec rounds, vaesdeclast, state ^= tweak
+; when lt128 == 0 the next 8 Tweak values are generated into [TW] between the AES rounds
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks are processed
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+; a non-zero %%lt128 disables all tweak generation and the reload of TW1..TW7 at the end
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+; the scalar tweak doubling is split across rounds; vmovdqa/vaesdec do not write RFLAGS, so CF from shl/adc survives to the later adc/cmovc
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+; only 7 tweak registers are reloaded; the 8th new tweak stays in memory at [TW + 16*7]
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; encrypt_by_eight: despite the name, this DECRYPTS 8 blocks in parallel (vaesdec/vaesdeclast with the round keys at [keys])
+; when last_eight == 0 the next 8 tweak values are computed between the AES rounds
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (used for the XORs only; not reloaded by this macro)
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18 ; non-zero: skip the interleaved tweak generation
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+; the tweak doubling is split across rounds; vmovdqa/vaesdec do not write RFLAGS, so CF from shl/adc survives to the later adc/cmovc
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+; tweak 8 is stored here instead of in round 13; this store and the reloads below also run when last_eight != 0
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_dec_avx:function
+XTS_AES_256_dec_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7: ; leading group of 7 blocks; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7 ; advance the input pointer past the 7 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store the 7 processed blocks (xmm1-xmm7) to the output buffer
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7 ; advance the output pointer past the 7 blocks
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks remain: skip the main loop
+
+ jmp _main_loop
+_initial_num_blocks_is_6: ; leading group of 6 blocks; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6 ; advance the input pointer past the 6 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store the 6 processed blocks (xmm1-xmm6) to the output buffer
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6 ; advance the output pointer past the 6 blocks
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks remain: skip the main loop
+
+ jmp _main_loop
+_initial_num_blocks_is_5: ; leading group of 5 blocks; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5 ; advance the input pointer past the 5 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store the 5 processed blocks (xmm1-xmm5) to the output buffer
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5 ; advance the output pointer past the 5 blocks
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks remain: skip the main loop
+
+ jmp _main_loop
+_initial_num_blocks_is_4: ; leading group of 4 blocks; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4 ; advance the input pointer past the 4 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store the 4 processed blocks (xmm1-xmm4) to the output buffer
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4 ; advance the output pointer past the 4 blocks
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks remain: skip the main loop
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3: ; leading group of 3 blocks; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3 ; advance the input pointer past the 3 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store the 3 processed blocks (xmm1-xmm3) to the output buffer
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3 ; advance the output pointer past the 3 blocks
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks remain: skip the main loop
+
+ jmp _main_loop
+_initial_num_blocks_is_2: ; leading group of 2 blocks; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2 ; advance the input pointer past the 2 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store the 2 processed blocks (xmm1-xmm2) to the output buffer
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2 ; advance the output pointer past the 2 blocks
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks remain: skip the main loop
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1: ; leading group of 1 block; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1 ; advance the input pointer past the block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store the single processed block (xmm1) to the output buffer
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16 ; advance the output pointer past the block
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 whole blocks remain: skip the main loop
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0: ; no leading group: derive tweaks 2..8 from the tweak at [TW] for the first 8-block group
+ mov twtempl, [TW+8*0] ; twtemph:twtempl = the 128-bit tweak stored at [TW]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0] ; tweak 1
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak doubling step: start with no reduction term
+ shl twtempl, 1 ; shift the 128-bit value twtemph:twtempl left by one bit ...
+ adc twtemph, twtemph ; ... carrying bit 63 of the low half into the high half
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit was shifted out of the top: select the reduction constant
+ xor twtempl, ghash_poly_8b_temp ; fold the reduction constant (or 0) into the low half
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1] ; tweak 2
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2] ; tweak 3
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3] ; tweak 4
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4] ; tweak 5
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5] ; tweak 6
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6] ; tweak 7
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl ; tweak 8 is only kept in memory at [TW+16*7]
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7] -- left disabled: the 8th tweak is passed to encrypt_by_eight as the memory operand [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; input is exactly 8 whole blocks (plus optional tail); otherwise fall through into _main_loop
+_main_loop: ; process 8 blocks (128 bytes) per iteration until the output pointer reaches target_ptr_val
+ ; load 8 input blocks
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128 ; advance the input pointer by 8 blocks
+; encrypt_by_eight is defined outside this excerpt; it receives blocks xmm1-xmm8, tweaks xmm9-xmm15 and [TW+16*7], scratch xmm0, and a flag (0 here, 1 at _last_eight/_done_final -- presumably "last group", confirm)
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store the 8 processed blocks
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128 ; advance the output pointer by 8 blocks
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop ; on equality fall through to _last_eight
+
+_last_eight: ; the final 8 whole blocks remain, possibly followed by a 1..15 byte tail
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final ; ZF from the 'and': no partial tail, so no ciphertext stealing is needed
+
+ ; generate next Tweak value (NOTE(review): assumes twtemph:twtempl still hold the most recently generated tweak -- confirm against encrypt_by_eight / encrypt_initial)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction constant only if a bit was shifted out
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7] ; xmm1 = tweak currently in the 8th slot
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt: park it at [TW], which _steal_cipher uses
+
+ mov [TW + 16*7], twtempl ; the 8th block is processed with the newly generated tweak instead
+ mov [TW + 16*7+8], twtemph
+
+ ; load the last 8 whole input blocks
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store the first 7 processed blocks; the 8th (xmm8) is consumed by _steal_cipher
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final: ; final 8 whole blocks with no partial tail (reached from _last_eight when N_val mod 16 == 0)
+ ; load the last 8 input blocks
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store the first 7 processed blocks; the 8th (xmm8) is stored at _done
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher: ; in: xmm8 = processed last whole block, N_val = tail length (1..15), [TW] = tweak for the stolen block
+ ; start cipher stealing
+
+
+ vmovdqa xmm2, xmm8 ; keep an unshifted copy for the blend below
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val] ; shuffle control read at byte offset N_val in the table
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move; these are the 16 input bytes that end at the end of the input
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 ; writes the N_val tail output bytes; the leading 16-N_val bytes of this store are overwritten by the store at _done
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val ; table offset 16-N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1] ; flip bit 7 of every control byte
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ; xmm0 is the mask operand: bytes whose mask bit 7 is set come from xmm2, the others stay from xmm3
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ; decrypt the last block with cipher stealing (AES-256: initial key XOR + 14 rounds)
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdec xmm8, [keys + 16*10] ; round 10
+ vaesdec xmm8, [keys + 16*11] ; round 11
+ vaesdec xmm8, [keys + 16*12] ; round 12
+ vaesdec xmm8, [keys + 16*13] ; round 13
+ vaesdeclast xmm8, [keys + 16*14] ; round 14 (final)
+
+ ; xor Tweak value
+ vpxor xmm8, [TW] ; execution falls through to _done, which stores xmm8
+
+_done: ; in: xmm8 = last whole output block; callers on the short-input paths pre-bias ptr_ciphertext so +16*7 is the right slot
+ ; store the last whole output block
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_: ; epilogue: restore saved registers, release the frame, return
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET ; undo the 'sub rsp, VARIABLE_OFFSET' from the prologue
+
+ ret
+
+
+
+
+
+_less_than_128_bytes: ; reached when (N_val & -16) < 128, i.e. at most 7 whole blocks
+ cmp N_val, 16
+ jb _ret_ ; less than one whole block: return without producing any output
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * number of whole blocks (1..7 on this path)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+; only (7 << 4) is left: fall through to _num_blocks_is_7
+
+
+
+_num_blocks_is_7: ; short input: 7 whole blocks plus optional 1..15 byte tail; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1 ; bias the input pointer so _steal_cipher's fixed offset 112 refers to block 7 of this input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7 ; no partial tail: plain 7-block path
+
+_steal_cipher_7: ; tail present: generate one more tweak and swap it with block 7's tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction constant only if a bit was shifted out
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; new tweak goes to [TW+16*1]
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15 ; park xmm15 (presumably block 7's tweak -- confirm against initialize) at [TW] for _steal_cipher
+ vmovdqa xmm15, [TW+16*1] ; block 7 is processed with the newly generated tweak
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store output blocks 1-6; block 7 (xmm7) is handed to _steal_cipher in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; bias the output pointer so the shared code's +112 / +16*7 offsets address block 7
+ vmovdqa xmm8, xmm7 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_7: ; no tail: process 7 blocks and let _done store the last one
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store output blocks 1-6; block 7 (xmm7) is stored by _done via xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; bias so that _done's [ptr_ciphertext+16*7] is the slot of block 7
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6: ; short input: 6 whole blocks plus optional 1..15 byte tail; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2 ; bias the input pointer so _steal_cipher's fixed offset 112 refers to block 6 of this input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6 ; no partial tail: plain 6-block path
+
+_steal_cipher_6: ; tail present: generate one more tweak and swap it with block 6's tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction constant only if a bit was shifted out
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; new tweak goes to [TW+16*1]
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14 ; park xmm14 (presumably block 6's tweak -- confirm against initialize) at [TW] for _steal_cipher
+ vmovdqa xmm14, [TW+16*1] ; block 6 is processed with the newly generated tweak
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store output blocks 1-5; block 6 (xmm6) is handed to _steal_cipher in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; bias the output pointer so the shared code's +112 / +16*7 offsets address block 6
+ vmovdqa xmm8, xmm6 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_6: ; no tail: process 6 blocks and let _done store the last one
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store output blocks 1-5; block 6 (xmm6) is stored by _done via xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; bias so that _done's [ptr_ciphertext+16*7] is the slot of block 6
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5: ; short input: 5 whole blocks plus optional 1..15 byte tail; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3 ; bias the input pointer so _steal_cipher's fixed offset 112 refers to block 5 of this input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5 ; no partial tail: plain 5-block path
+
+_steal_cipher_5: ; tail present: generate one more tweak and swap it with block 5's tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction constant only if a bit was shifted out
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; new tweak goes to [TW+16*1]
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13 ; park xmm13 (presumably block 5's tweak -- confirm against initialize) at [TW] for _steal_cipher
+ vmovdqa xmm13, [TW+16*1] ; block 5 is processed with the newly generated tweak
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store output blocks 1-4; block 5 (xmm5) is handed to _steal_cipher in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; bias the output pointer so the shared code's +112 / +16*7 offsets address block 5
+ vmovdqa xmm8, xmm5 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_5: ; no tail: process 5 blocks and let _done store the last one
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store output blocks 1-4; block 5 (xmm5) is stored by _done via xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; bias so that _done's [ptr_ciphertext+16*7] is the slot of block 5
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4: ; short input: 4 whole blocks plus optional 1..15 byte tail; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4 ; bias the input pointer so _steal_cipher's fixed offset 112 refers to block 4 of this input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4 ; no partial tail: plain 4-block path
+
+_steal_cipher_4: ; tail present: generate one more tweak and swap it with block 4's tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction constant only if a bit was shifted out
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; new tweak goes to [TW+16*1]
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12 ; park xmm12 (presumably block 4's tweak -- confirm against initialize) at [TW] for _steal_cipher
+ vmovdqa xmm12, [TW+16*1] ; block 4 is processed with the newly generated tweak
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store output blocks 1-3; block 4 (xmm4) is handed to _steal_cipher in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; bias the output pointer so the shared code's +112 / +16*7 offsets address block 4
+ vmovdqa xmm8, xmm4 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_4: ; no tail: process 4 blocks and let _done store the last one
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store output blocks 1-3; block 4 (xmm4) is stored by _done via xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; bias so that _done's [ptr_ciphertext+16*7] is the slot of block 4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3: ; short input: 3 whole blocks plus optional 1..15 byte tail; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5 ; bias the input pointer so _steal_cipher's fixed offset 112 refers to block 3 of this input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3 ; no partial tail: plain 3-block path
+
+_steal_cipher_3: ; tail present: generate one more tweak and swap it with block 3's tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction constant only if a bit was shifted out
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; new tweak goes to [TW+16*1]
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11 ; park xmm11 (presumably block 3's tweak -- confirm against initialize) at [TW] for _steal_cipher
+ vmovdqa xmm11, [TW+16*1] ; block 3 is processed with the newly generated tweak
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store output blocks 1-2; block 3 (xmm3) is handed to _steal_cipher in xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; bias the output pointer so the shared code's +112 / +16*7 offsets address block 3
+ vmovdqa xmm8, xmm3 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_3: ; no tail: process 3 blocks and let _done store the last one
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store output blocks 1-2; block 3 (xmm3) is stored by _done via xmm8
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; bias so that _done's [ptr_ciphertext+16*7] is the slot of block 3
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2: ; short input: 2 whole blocks plus optional 1..15 byte tail; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6 ; bias the input pointer so _steal_cipher's fixed offset 112 refers to block 2 of this input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2 ; no partial tail: plain 2-block path
+
+_steal_cipher_2: ; tail present: generate one more tweak and swap it with block 2's tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction constant only if a bit was shifted out
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; new tweak goes to [TW+16*1]
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10 ; park xmm10 (presumably block 2's tweak -- confirm against initialize) at [TW] for _steal_cipher
+ vmovdqa xmm10, [TW+16*1] ; block 2 is processed with the newly generated tweak
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store output block 1; block 2 (xmm2) is handed to _steal_cipher in xmm8
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; bias the output pointer so the shared code's +112 / +16*7 offsets address block 2
+ vmovdqa xmm8, xmm2 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_2: ; no tail: process 2 blocks and let _done store the last one
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store output block 1; block 2 (xmm2) is stored by _done via xmm8
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; bias so that _done's [ptr_ciphertext+16*7] is the slot of block 2
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1: ; short input: 1 whole block plus optional 1..15 byte tail; initialize / encrypt_initial are macros defined outside this excerpt
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7 ; bias the input pointer so _steal_cipher's fixed offset 112 refers to block 1 of this input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1 ; no partial tail: plain single-block path
+
+_steal_cipher_1: ; tail present: generate one more tweak and swap it with block 1's tweak
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction constant only if a bit was shifted out
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; new tweak goes to [TW+16*1]
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9 ; park xmm9 (presumably block 1's tweak -- confirm against initialize) at [TW] for _steal_cipher
+ vmovdqa xmm9, [TW+16*1] ; block 1 is processed with the newly generated tweak
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the only block (xmm1) is handed to _steal_cipher in xmm8
+
+ sub ptr_ciphertext, 16*7 ; bias the output pointer so the shared code's +112 / +16*7 offsets address block 1
+ vmovdqa xmm8, xmm1 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_1: ; no tail: process the single block and let _done store it
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the only block (xmm1) is stored by _done via xmm8
+
+ sub ptr_ciphertext, 16*7 ; bias so that _done's [ptr_ciphertext+16*7] is the slot of block 1
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+; 32-byte vpshufb control table used by _steal_cipher; a 16-byte load at byte offset k (1..15) gives the control for a left shift by 16-k bytes
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 ; bytes 0..15: control bytes with bit 7 set make vpshufb write zero
+dq 0x0706050403020100, 0x000e0d0c0b0a0908 ; bytes 16..31: source byte indices; _steal_cipher loads at offsets 1..15 only, so byte 31 is never read
+; mask1 is XORed into a table-derived control in _steal_cipher, flipping bit 7 of each byte (turns the left-shift control into the right-shift control / blend mask)
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm
new file mode 100644
index 00000000..d5a75d0a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm
@@ -0,0 +1,1895 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; the data key (k1, second argument) is copied to the stack, aligned to 16 bytes
+; the tweak key (k2, first argument) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+; frame layout from rsp: 8 tweaks (16*8 bytes), 15 round keys (16*15 bytes), then the register save areas
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+; NOTE(review): the %else branch below applies to every non-elf64 output format, while _xmm above is defined for win64 only
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 (presumably so rsp is 16-byte aligned after the prologue's sub, as the vmovdqa accesses to TW/keys need)
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 (same reason; adds room for 10 xmm registers and 3 GPRs)
+%endif
+
+%define GHASH_POLY 0x87 ; constant XORed into the low qword of the tweak when the doubling step shifts a bit out
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters (in the order of the prototype above: k2, k1, TW_initial, N, ct, pt)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8 ; 5th argument: per the prototype this is ct, the input of the decrypt
+ %xdefine ptr_ciphertext r9 ; 6th argument: per the prototype this is pt, the output of the decrypt
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters (tmp1 / target_ptr_val alias the registers of ptr_key2 / ptr_key1)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx ; high qword of the 128-bit tweak (twtempl is the low qword)
+
+
+; macro to encrypt the tweak value
+; Interleaves two independent jobs: (1) encrypt %%xstate_tweak with the 15 round keys at %%ptr_key2, storing the result at [TW]; (2) copy the 15 round keys at %%ptr_key1 to %%ptr_expanded_keys, index for index.
+%macro encrypt_T 8
+%define %%xkey2 %1 ; scratch: round key currently loaded from %%ptr_key2
+%define %%xstate_tweak %2 ; in: tweak value (must already be loaded by the caller); out: encrypted tweak
+%define %%xkey1 %3 ; scratch: round key being copied from %%ptr_key1
+%define %%xraw_key %4 ; not referenced in this macro body
+%define %%xtmp %5 ; not referenced in this macro body
+%define %%ptr_key2 %6 ; source of the round keys applied to the tweak (read with unaligned loads)
+%define %%ptr_key1 %7 ; source of the round keys copied to the stack (read with unaligned loads)
+%define %%ptr_expanded_keys %8 ; destination of the copy; written with vmovdqa, so it must be 16-byte aligned
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14] ; the copy walks from index 14 down to index 0
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial input blocks (read through ptr_plaintext)
+%macro initialize 16
+; For k = 1..%%num_initial_blocks: tweak k is kept at [TW+16*(k-1)] and loaded into %%TWk, and input block k is loaded into %%STk. Tweak 1 is whatever is already stored at [TW].
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro body)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0] ; twtemph:twtempl = running 128-bit tweak used by the doubling steps below
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; doubling step: start with no reduction term
+ shl twtempl, 1 ; 128-bit left shift by one bit across twtemph:twtempl
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit was shifted out of the top: select the reduction constant
+ xor twtempl, ghash_poly_8b_temp ; fold the reduction constant (or 0) into the low qword
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; same doubling step as above
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+; on exit twtemph:twtempl hold the last tweak produced above (tweak 1 if %%num_initial_blocks is 1)
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1 (data block 1, transformed in place)
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (accepted but never referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1 (xored into state 1 before and after the AES rounds)
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register, holds the current round key
+%define %%num_blocks %17
+; %%num_blocks blocks are processed; the rounds use vaesdec/vaesdeclast, i.e. the blocks are decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes: when 0, the next 8 Tweak values are generated between the rounds; otherwise that code is not assembled
+
+ ; xor Tweak value into each active state before the AES rounds
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK: xor round key 0 from [keys] into each active state
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; clear the reduction term (must precede shl: xor also clears CF)
+ shl twtempl, 1 ; double the 128-bit tweak: shift the low qword, its top bit goes to CF
+ adc twtemph, twtemph ; shift the high qword and pull in CF; CF now holds the bit shifted out of the high qword
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; consumes CF from the adc before round 1; the vector instructions in between do not modify the flags
+ xor twtempl, ghash_poly_8b_temp ; fold ghash_poly_8b into the low qword when a bit was shifted out
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated (low qword; the high qword is stored after round 3)
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high qword of next Tweak2
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction term for Tweak3, applied after round 4
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; first half of the Tweak4 update; its CF is consumed by the adc after round 5
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated (kept in memory only; it is not loaded into a register by this macro)
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14 (final round, uses vaesdeclast)
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values again, after the AES rounds (same tweaks as at the top)
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values 1-7 into the tweak registers; the 8th stays in memory at [TW + 16*7]
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
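+; Process 8 blocks in parallel (in this file the rounds are vaesdec/vaesdeclast, i.e. the blocks are decrypted)
+; generate the next 8 tweak values unless the last_eight argument is 1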
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1 (data block 1, transformed in place)
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1 (xored into state 1 before and after the AES rounds)
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (the callers in this file pass the memory operand [TW+16*7])
+%define %%T0 %17 ; Temp register, holds the current round key
+%define %%last_eight %18 ; when 0, the next 8 tweak values are computed between the rounds; otherwise that code is not assembled
+
+ ; xor Tweak values into the states before the AES rounds
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK: xor round key 0 from [keys] into every state
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; clear the reduction term (must precede shl: xor also clears CF)
+ shl twtempl, 1 ; double the 128-bit tweak: shift the low qword, its top bit goes to CF
+ adc twtemph, twtemph ; shift the high qword and pull in CF; CF now holds the bit shifted out of the high qword
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; select ghash_poly_8b as reduction term when a bit was shifted out
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; first half of the Tweak3 update; its CF is consumed by the adc after round 4 (the vector instructions in between do not modify the flags)
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated (low qword; the high qword is stored after round 5)
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF from here is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; first half of the Tweak7 update; its CF is consumed by the adc after round 11
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated (low qword; the high qword is stored after round 12)
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF from here is consumed by the cmovc after round 13
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; next Tweak8 is now complete in twtempl/twtemph, but it is not stored to [TW + 8*14] / [TW + 8*15] here:
+; the store is deferred to the end of the macro, after the final vpxor that still reads %%TW8
+%endif
+ ; round 14 (final round, uses vaesdeclast)
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values again, after the AES rounds (same tweaks as at the top)
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl ; next Tweak8; executed unconditionally (with %%last_eight set, this macro has not modified twtempl/twtemph)
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values 1-7 (also executed unconditionally); the 8th stays in memory at [TW + 16*7]
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_dec_expanded_key_avx:function
+XTS_AES_256_dec_expanded_key_avx:
+
+ sub rsp, VARIABLE_OFFSET ; reserve the local frame (released by the matching add before ret)
+
+ mov [_gpr + 8*0], rbx ; save rbx, restored at _ret_
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi ; win64 only: rdi, rsi and xmm6-xmm15 are saved here and restored at _ret_
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer (read from the caller's stack on win64)
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer (read from the caller's stack on win64)
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) will not be stitched with Tweak calculations
+ jl _less_than_128_bytes ; fewer than 128 bytes of whole blocks: handled separately
+
+ add target_ptr_val, ptr_ciphertext ; output address at which only the final 8 whole blocks remain
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks mod 8)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1 ; no match left: fall through to _initial_num_blocks_is_7
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing: on entry xmm8 holds the output of the last full block and N_val the number of trailing bytes (N_val mod 16)
+
+
+ vmovdqa xmm2, xmm8 ; keep an unshifted copy of xmm8 for the blend below
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table] ; twtempl is reused as a scratch pointer from here on
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 ; 16-byte store; the part below offset 128 is overwritten later by the store at _done
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is the blend mask (an explicit operand in this VEX form)
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;decrypt last block with cipher stealing (vaesdec rounds, 14 rounds)
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdec xmm8, [keys + 16*10] ; round 10
+ vaesdec xmm8, [keys + 16*11] ; round 11
+ vaesdec xmm8, [keys + 16*12] ; round 12
+ vaesdec xmm8, [keys + 16*13] ; round 13
+ vaesdeclast xmm8, [keys + 16*14] ; round 14 (final)
+
+ ; xor Tweak value (execution then falls through to _done, which stores xmm8)
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ ; tail handler for 4 full blocks: rewind the input pointer by 4 (= 8-4) blocks - presumably so the shared tail can use fixed 8-block offsets (TODO confirm at _steal_cipher/_done)
+ sub ptr_plaintext, 16*4
+ ; a zero remainder means no partial trailing block, so the tweak swap below is skipped
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+ ; partial trailing block present: derive one more tweak and use it for block 4
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start from 0 so the xor below is a no-op unless a bit is shifted out
+ shl twtempl, 1 ; shift the 128-bit value twtemph:twtempl left by one bit ...
+ adc twtemph, twtemph ; ... carrying bit 63 of the low half into the high half
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; bit shifted out of the high half -> select the value held in ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; the new tweak is written to slot 1 of the TW area
+ mov [TW+8*3], twtemph
+ ; swap: park xmm12 (the tweak register paired with block 4, if the argument order is as the invocation suggests) in slot 0 and continue with the new tweak
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+ ; the saved slot-0 value is presumably consumed by the shared _steal_cipher tail (not visible here)
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 processed blocks; the 4th is not written here
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ ; rewind the output pointer like the input pointer and hand block 4 over in xmm8
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 processed blocks; the 4th is not written here
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ ; same hand-over as above: block 4 travels in xmm8 to the shared _done tail
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ ; tail handler for 3 full blocks: rewind the input pointer by 5 (= 8-3) blocks - presumably so the shared tail can use fixed 8-block offsets (TODO confirm at _steal_cipher/_done)
+ sub ptr_plaintext, 16*5
+ ; a zero remainder means no partial trailing block, so the tweak swap below is skipped
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+ ; partial trailing block present: derive one more tweak and use it for block 3
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start from 0 so the xor below is a no-op unless a bit is shifted out
+ shl twtempl, 1 ; shift the 128-bit value twtemph:twtempl left by one bit ...
+ adc twtemph, twtemph ; ... carrying bit 63 of the low half into the high half
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; bit shifted out of the high half -> select the value held in ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; the new tweak is written to slot 1 of the TW area
+ mov [TW+8*3], twtemph
+ ; swap: park xmm11 (the tweak register paired with block 3, if the argument order is as the invocation suggests) in slot 0 and continue with the new tweak
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+ ; the saved slot-0 value is presumably consumed by the shared _steal_cipher tail (not visible here)
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 processed blocks; the 3rd is not written here
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ ; rewind the output pointer like the input pointer and hand block 3 over in xmm8
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 processed blocks; the 3rd is not written here
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ ; same hand-over as above: block 3 travels in xmm8 to the shared _done tail
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ ; tail handler for 2 full blocks: rewind the input pointer by 6 (= 8-2) blocks - presumably so the shared tail can use fixed 8-block offsets (TODO confirm at _steal_cipher/_done)
+ sub ptr_plaintext, 16*6
+ ; a zero remainder means no partial trailing block, so the tweak swap below is skipped
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+ ; partial trailing block present: derive one more tweak and use it for block 2
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start from 0 so the xor below is a no-op unless a bit is shifted out
+ shl twtempl, 1 ; shift the 128-bit value twtemph:twtempl left by one bit ...
+ adc twtemph, twtemph ; ... carrying bit 63 of the low half into the high half
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; bit shifted out of the high half -> select the value held in ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; the new tweak is written to slot 1 of the TW area
+ mov [TW+8*3], twtemph
+ ; swap: park xmm10 (the tweak register paired with block 2, if the argument order is as the invocation suggests) in slot 0 and continue with the new tweak
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+ ; the saved slot-0 value is presumably consumed by the shared _steal_cipher tail (not visible here)
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first processed block; the 2nd is not written here
+ vmovdqu [ptr_ciphertext], xmm1
+ ; rewind the output pointer like the input pointer and hand block 2 over in xmm8
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first processed block; the 2nd is not written here
+ vmovdqu [ptr_ciphertext], xmm1
+ ; same hand-over as above: block 2 travels in xmm8 to the shared _done tail
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ ; tail handler for a single full block: rewind the input pointer by 7 (= 8-1) blocks - presumably so the shared tail can use fixed 8-block offsets (TODO confirm at _steal_cipher/_done)
+ sub ptr_plaintext, 16*7
+ ; a zero remainder means no partial trailing block, so the tweak swap below is skipped
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+ ; partial trailing block present: derive one more tweak and use it for the single full block
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; start from 0 so the xor below is a no-op unless a bit is shifted out
+ shl twtempl, 1 ; shift the 128-bit value twtemph:twtempl left by one bit ...
+ adc twtemph, twtemph ; ... carrying bit 63 of the low half into the high half
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; bit shifted out of the high half -> select the value held in ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl ; the new tweak is written to slot 1 of the TW area
+ mov [TW+8*3], twtemph
+ ; swap: park xmm9 (the tweak register paired with block 1, if the argument order is as the invocation suggests) in slot 0 and continue with the new tweak
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+ ; the saved slot-0 value is presumably consumed by the shared _steal_cipher tail (not visible here)
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the only processed block is handed over in xmm8
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the only processed block is handed over in xmm8
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; 32-byte table of byte-shuffle control values, used as shift constants for the vpshufb instruction
+; a 16-byte load at byte offset k (1..15) into the table yields the control words listed below:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080 ; 16 bytes, each with only bit 7 set
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm
new file mode 100644
index 00000000..6ee4e337
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm
@@ -0,0 +1,1897 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values (8 slots of 16 bytes at the bottom of the frame)
+%define keys rsp + 16*8 ; store 15 expanded keys (16 bytes each, directly above the tweak slots)
+; frame layout above the keys (offset 16*23 = 16*8 tweaks + 16*15 keys) depends on the output format
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15 (10 registers, 16 bytes each)
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 - presumably so rsp is 16-byte aligned after the sub, as the movdqa accesses to TW/keys require
+%else
+%define _gpr rsp + 16*33 ; store rbx, rdi, rsi (in that order, above the 10 saved xmm registers)
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 - presumably so rsp is 16-byte aligned after the sub, as the movdqa accesses to TW/keys require
+%endif
+; constant loaded into ghash_poly_8b; it is xored into the low tweak half whenever doubling the 128-bit tweak shifts a bit out
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters, in prototype order (k2, k1, TW_initial, N, ct, pt)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi ; k2: round keys used for the tweak
+ %xdefine ptr_key1 rsi ; k1: round keys used for the data blocks
+ %xdefine T_val rdx ; TW_initial
+ %xdefine N_val rcx ; N, sector size in bytes
+ %xdefine ptr_plaintext r8 ; 5th argument: in this decrypt routine that is ct, the INPUT buffer (name presumably inherited from the encrypt variant)
+ %xdefine ptr_ciphertext r9 ; 6th argument: in this decrypt routine that is pt, the OUTPUT buffer
+%else
+ %xdefine ptr_key2 rcx ; k2: round keys used for the tweak
+ %xdefine ptr_key1 rdx ; k1: round keys used for the data blocks
+ %xdefine T_val r8 ; TW_initial
+ %xdefine N_val r9 ; N, sector size in bytes
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] - 5th argument (ct, the input), loaded from the stack by the function
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] - 6th argument (pt, the output), loaded from the stack by the function
+%endif
+
+; arguments for temp parameters (tmp1 and target_ptr_val alias the key-pointer registers, so they are only usable once the key pointers are no longer needed)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi ; same register as ptr_key2
+ %define target_ptr_val rsi ; same register as ptr_key1
+ %define ghash_poly_8b r10 ; holds GHASH_POLY
+ %define ghash_poly_8b_temp r11 ; 0 or ghash_poly_8b, selected by cmovc during tweak doubling
+%else
+ %define tmp1 rcx ; same register as ptr_key2
+ %define target_ptr_val rdx ; same register as ptr_key1
+ %define ghash_poly_8b rdi ; holds GHASH_POLY; rdi is saved to the frame in the prologue
+ %define ghash_poly_8b_temp rsi ; 0 or ghash_poly_8b, selected by cmovc; rsi is saved to the frame in the prologue
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation (low 64 bits of the running tweak)
+%define twtemph rbx ; high 64 bits of the running tweak; rbx is saved to the frame in the prologue
+
+
+; encrypt_T: encrypt the tweak in %2 with the 15 round keys at %6, while copying the 15 round keys at %7 to the aligned area %8
+; the two jobs are interleaved instruction by instruction; on exit the encrypted tweak is in %2 and also stored at [TW]
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+ ; note: %%xraw_key (%4) and %%xtmp (%5) are defined but not referenced below; %%xkey2 and %%xkey1 are clobbered
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+ ; key1 round keys are copied index-for-index (14 down to 0): unaligned load from the caller, aligned store to the stack
+ movdqu %%xkey1, [%%ptr_key1 + 16*14]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*13]
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*12]
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*11]
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*11]
+ aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*12]
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*13]
+ aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*14]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*0]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; initialize: generate the tweak values for the first %16 (1..7) blocks, starting from the tweak stored at [TW]
+; and load the matching %16 input blocks from ptr_plaintext into the state registers
+%macro initialize 16
+; each further tweak is the previous one doubled (128-bit left shift with conditional xor of ghash_poly_8b); it is stored in the next TW slot
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro body)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+ ; on exit twtemph:twtempl hold the last tweak generated here; ptr_plaintext itself is not advanced
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt_initial: despite the name, this file's version runs the AES decryption rounds (aesdec/aesdeclast) on the initial blocks
+; %17 selects how many of the states %1..%7 are processed (1..7); each is xored with its tweak before and after the rounds
+; when %18 (lt128) is 0, the next 8 Tweak values are generated into the TW area and the first 7 are loaded into %9..%15
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro body)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register (holds the current round key)
+%define %%num_blocks %17
+; %%num_blocks blocks are processed
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes: when non-zero, no further tweak values are generated or loaded
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+ ; round keys are read from the aligned stack copy at [keys]; since aesdec is used, k1 is presumably supplied as a decryption key schedule (TODO confirm with the caller)
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+ ; from here on the scalar tweak doubling (shl/adc/cmovc/xor on twtemph:twtempl) is stitched between the AES rounds
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+ ; the carry produced above is consumed by the cmovc after round 1; the movdqa/aesdec in between leave the flags untouched
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+ ; overwriting the TW slots is safe here: the current tweaks are already held in %%TW1..%%TW7
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+ ; the high half of Tweak2 is stored after round 3
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+ ; the selected reduction value is xored into twtempl after round 4
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+ ; the carry from the shl above stays live across round 5 and is consumed by the adc that follows it
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+ ; Tweak5..Tweak8 are each generated in one piece after rounds 6..9
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated (kept only in memory, slot 7; it is not loaded into a register below)
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 14 (final round, aesdeclast)
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values (second tweak xor, with the same tweaks as before the rounds)
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+ ; the tweak registers are no longer needed for these blocks and can be refilled
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; encrypt_by_eight: process 8 blocks in parallel - despite the name, this file's version uses aesdec/aesdeclast
+; when %18 (last_eight) is 0 the next 8 tweak values are computed between the AES rounds; slot 7 of the TW area is written and %9..%15 are reloaded from slots 0..6 in either case
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (never reloaded by this macro, unlike tweaks 1..7)
+%define %%T0 %17 ; Temp register (holds the current round key)
+%define %%last_eight %18
+ ; %%last_eight != 0 suppresses the stitched tweak computation (the unconditional tail after round 14 still runs)
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+ ; round keys are read from the aligned stack copy at [keys]
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+ ; tweak doubling (shl/adc/cmovc/xor on twtemph:twtempl) is split into pieces between the rounds; movdqa/aesdec leave the flags untouched, so a carry can stay live across a round
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next tweak 1 -> slot 0
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next tweak 2 -> slot 1
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; carry consumed by the adc after round 4
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next tweak 3 (low half) -> slot 2
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; next tweak 3 (high half)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; carry consumed by the cmovc after round 6
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next tweak 4 -> slot 3
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next tweak 5 -> slot 4
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next tweak 6 -> slot 5
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; carry consumed by the adc after round 11
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next tweak 7 (low half) -> slot 6
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; next tweak 7 (high half)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; carry consumed by the cmovc after round 13
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; next tweak 8 is complete in twtemph:twtempl but is deliberately NOT stored to slot 7 yet;
+; the store to [TW + 8*14] / [TW + 8*15] happens after the final tweak xor below
+%endif
+ ; round 14 (final round, aesdeclast)
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values (second tweak xor, with the same tweaks as before the rounds)
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+ ; slot 7 is written only now - presumably because callers pass [TW + 16*7] itself as %%TW8, so it must stay intact until the xor above (TODO confirm at the call sites); this store is not guarded by %%last_eight
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_dec_expanded_key_sse:function
+XTS_AES_256_dec_expanded_key_sse:
+; Entry: reserve the stack frame, save the registers that _ret_ restores, encrypt the tweak, then dispatch on N_val.
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer (5th argument, passed on the stack)
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer (6th argument, passed on the stack)
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are not stitched with Tweak calculations
+ jl _less_than_128_bytes ; fewer than 8 full blocks in total
+
+ add target_ptr_val, ptr_ciphertext ; output address at which only the final 8 full blocks remain
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = (number of full blocks mod 8) * 16
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1 ; none matched: tmp1 == (7 << 4), fall through to the 7-block case
+
+_initial_num_blocks_is_7: ; full-block count mod 8 == 7: handle 7 blocks first so a multiple of 8 remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7 ; advance the input pointer past the 7 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store the 7 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 full blocks remain
+
+ jmp _main_loop
+_initial_num_blocks_is_6: ; full-block count mod 8 == 6: handle 6 blocks first so a multiple of 8 remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6 ; advance the input pointer past the 6 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store the 6 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 full blocks remain
+
+ jmp _main_loop
+_initial_num_blocks_is_5: ; full-block count mod 8 == 5: handle 5 blocks first so a multiple of 8 remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5 ; advance the input pointer past the 5 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store the 5 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 full blocks remain
+
+ jmp _main_loop
+_initial_num_blocks_is_4: ; full-block count mod 8 == 4: handle 4 blocks first so a multiple of 8 remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4 ; advance the input pointer past the 4 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store the 4 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 full blocks remain
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3: ; full-block count mod 8 == 3: handle 3 blocks first so a multiple of 8 remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3 ; advance the input pointer past the 3 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store the 3 processed blocks to the output buffer
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 full blocks remain
+
+ jmp _main_loop
+_initial_num_blocks_is_2: ; full-block count mod 8 == 2: handle 2 blocks first so a multiple of 8 remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2 ; advance the input pointer past the 2 blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store the 2 processed blocks to the output buffer
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 full blocks remain
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1: ; full-block count mod 8 == 1: handle 1 block first so a multiple of 8 remains
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1 ; advance the input pointer past the block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store the processed block to the output buffer
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; only the final 8 full blocks remain
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0: ; full-block count is a multiple of 8: only derive the first eight tweaks
+ mov twtempl, [TW+8*0] ; twtemph:twtempl = 128-bit tweak stored at [TW]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0] ; tweak for block 1
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; each group below doubles the 128-bit tweak:
+ shl twtempl, 1 ; low qword << 1, bit 63 goes to CF
+ adc twtemph, twtemph ; high qword << 1 with carry-in, its bit 63 goes to CF
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; on carry-out select the feedback constant (else 0)
+ xor twtempl, ghash_poly_8b_temp ; fold the feedback into the low qword
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1] ; tweak for block 2
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2] ; tweak for block 3
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3] ; tweak for block 4
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4] ; tweak for block 5
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5] ; tweak for block 6
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6] ; tweak for block 7
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ; the tweak for block 8 stays in memory at [TW+16*7]; xmm0-xmm15 are all in use
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight ; exactly 8 full blocks: skip the main loop
+_main_loop: ; process 8 blocks per iteration until only the final 8 full blocks remain
+ ; load 8 input blocks
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+ ; last argument 0: not the final group, so the macro also derives the tweaks for the next 8 blocks
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store 8 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop ; loop until the output pointer reaches the start of the final 8 blocks
+
+_last_eight:
+ ; final 8 full blocks; a trailing partial block (N_val mod 16 != 0) needs ciphertext stealing
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final ; no partial block: plain final group
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+ ; block 8 now uses the newly generated tweak; its previous tweak waits in [TW] for _steal_cipher
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load the last 8 full input blocks
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store 7 output blocks; the 8th (xmm8) is consumed by _steal_cipher
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final: ; final 8 full blocks when the length is a multiple of 16 (no stealing)
+ ; load the last 8 input blocks
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store 7 output blocks; the 8th (xmm8) is stored at _done
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing: handles the trailing partial block of N_val (1..15) bytes
+ ; on entry xmm8 = result of the last full block, [TW] = tweak for the recombined block
+
+ movdqa xmm2, xmm8 ; keep a copy of the last full block's result for the blend below
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table] ; twtempl is reused here as a scratch pointer
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ; decrypt the recombined last block (cipher stealing); 14 AES rounds
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdec xmm8, [keys + 16*10] ; round 10
+ aesdec xmm8, [keys + 16*11] ; round 11
+ aesdec xmm8, [keys + 16*12] ; round 12
+ aesdec xmm8, [keys + 16*13] ; round 13
+ aesdeclast xmm8, [keys + 16*14] ; round 14 (final)
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store the final output block held in xmm8 (callers bias ptr_ciphertext so +16*7 addresses it)
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_: ; restore the registers saved at entry and release the stack frame
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes: ; fewer than 8 full blocks: dispatch on the number of full blocks
+ cmp N_val, 16
+ jb _ret_ ; less than one full block: return without writing any output
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = (number of full blocks) * 16
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1 ; none matched: 7 full blocks, fall through
+
+
+
+
+_num_blocks_is_7: ; exactly 7 full blocks, optionally followed by a partial block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1 ; bias so the fixed +112 offset in _steal_cipher addresses the tail of the input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7 ; no partial block
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; derive one more tweak (assumes twtemph:twtempl hold the last tweak from initialize)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15 ; park the previous tweak of block 7 in [TW] for _steal_cipher
+ movdqa xmm15, [TW+16*1] ; block 7 uses the newly derived tweak instead
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store the first 6 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; bias so the fixed +112 / +16*7 offsets in the shared tail code address block 7
+ movdqa xmm8, xmm7 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store the first 6 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1 ; bias so the +16*7 store at _done lands on block 7
+ movdqa xmm8, xmm7 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6: ; exactly 6 full blocks, optionally followed by a partial block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2 ; bias so the fixed +112 offset in _steal_cipher addresses the tail of the input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6 ; no partial block
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; derive one more tweak (assumes twtemph:twtempl hold the last tweak from initialize)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14 ; park the previous tweak of block 6 in [TW] for _steal_cipher
+ movdqa xmm14, [TW+16*1] ; block 6 uses the newly derived tweak instead
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store the first 5 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; bias so the fixed +112 / +16*7 offsets in the shared tail code address block 6
+ movdqa xmm8, xmm6 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store the first 5 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2 ; bias so the +16*7 store at _done lands on block 6
+ movdqa xmm8, xmm6 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5: ; exactly 5 full blocks, optionally followed by a partial block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3 ; bias so the fixed +112 offset in _steal_cipher addresses the tail of the input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5 ; no partial block
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; derive one more tweak (assumes twtemph:twtempl hold the last tweak from initialize)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13 ; park the previous tweak of block 5 in [TW] for _steal_cipher
+ movdqa xmm13, [TW+16*1] ; block 5 uses the newly derived tweak instead
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store the first 4 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; bias so the fixed +112 / +16*7 offsets in the shared tail code address block 5
+ movdqa xmm8, xmm5 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store the first 4 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3 ; bias so the +16*7 store at _done lands on block 5
+ movdqa xmm8, xmm5 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4: ; exactly 4 full blocks, optionally followed by a partial block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4 ; bias so the fixed +112 offset in _steal_cipher addresses the tail of the input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4 ; no partial block
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; derive one more tweak (assumes twtemph:twtempl hold the last tweak from initialize)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12 ; park the previous tweak of block 4 in [TW] for _steal_cipher
+ movdqa xmm12, [TW+16*1] ; block 4 uses the newly derived tweak instead
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; bias so the fixed +112 / +16*7 offsets in the shared tail code address block 4
+ movdqa xmm8, xmm4 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4 ; bias so the +16*7 store at _done lands on block 4
+ movdqa xmm8, xmm4 ; _done stores xmm8
+ jmp _done
+
+
+
+
+_num_blocks_is_3: ; exactly 3 full blocks, optionally followed by a partial block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5 ; bias so the fixed +112 offset in _steal_cipher addresses the tail of the input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3 ; no partial block
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; derive one more tweak (assumes twtemph:twtempl hold the last tweak from initialize)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11 ; park the previous tweak of block 3 in [TW] for _steal_cipher
+ movdqa xmm11, [TW+16*1] ; block 3 uses the newly derived tweak instead
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; bias so the fixed +112 / +16*7 offsets in the shared tail code address block 3
+ movdqa xmm8, xmm3 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 output blocks
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5 ; bias so the +16*7 store at _done lands on block 3
+ movdqa xmm8, xmm3 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2: ; exactly 2 full blocks, optionally followed by a partial block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6 ; bias so the fixed +112 offset in _steal_cipher addresses the tail of the input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2 ; no partial block
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; derive one more tweak (assumes twtemph:twtempl hold the last tweak from initialize)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10 ; park the previous tweak of block 2 in [TW] for _steal_cipher
+ movdqa xmm10, [TW+16*1] ; block 2 uses the newly derived tweak instead
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first output block
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; bias so the fixed +112 / +16*7 offsets in the shared tail code address block 2
+ movdqa xmm8, xmm2 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first output block
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6 ; bias so the +16*7 store at _done lands on block 2
+ movdqa xmm8, xmm2 ; _done stores xmm8
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1: ; exactly 1 full block, optionally followed by a partial block
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7 ; bias so the fixed +112 offset in _steal_cipher addresses the tail of the input
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1 ; no partial block
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; derive one more tweak (assumes twtemph:twtempl hold the last tweak from initialize)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9 ; park the previous tweak of block 1 in [TW] for _steal_cipher
+ movdqa xmm9, [TW+16*1] ; block 1 uses the newly derived tweak instead
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the single block is handed to _steal_cipher in xmm8
+
+ sub ptr_ciphertext, 16*7 ; bias so the fixed +112 / +16*7 offsets in the shared tail code address block 1
+ movdqa xmm8, xmm1 ; the shared tail code expects the last block in xmm8
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the single block is written at _done from xmm8
+
+ sub ptr_ciphertext, 16*7 ; bias so the +16*7 store at _done lands on block 1
+ movdqa xmm8, xmm1 ; _done stores xmm8
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; the 32-byte table below is read at an unaligned byte offset; offsets 1..15 yield the following 16-byte values:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1: ; xor-ed into a table entry in _steal_cipher: flips bit 7 of every pshufb control byte
+dq 0x8080808080808080, 0x8080808080808080
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm
new file mode 100644
index 00000000..6ea1ae8b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm
@@ -0,0 +1,1962 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the expanded round keys of k1 (the data key) are stored on the stack, aligned to 16 bytes
+; k2 (the tweak key) is needed only once, to encrypt the tweak, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values (8 x 16 bytes starting at rsp)
+%define keys rsp + 16*8 ; store 15 expanded keys (15 x 16 bytes, directly after the tweaks)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15 (10 x 16 bytes, directly after the keys)
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 (presumably so rsp is 16-byte aligned after the sub, as movdqa on [TW]/[keys] requires)
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 (presumably so rsp is 16-byte aligned after the sub, as movdqa on [TW]/[keys] requires)
+%endif
+
+%define GHASH_POLY 0x87 ; feedback constant, presumably for the tweak doubling (shl/adc/cmovc/xor) - confirm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_sse(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi ; k2
+ %xdefine ptr_key1 rsi ; k1
+ %xdefine T_val rdx ; TW_initial
+ %xdefine N_val rcx ; N
+ %xdefine ptr_plaintext r8 ; ct: the INPUT buffer of this decrypt routine, despite the name
+ %xdefine ptr_ciphertext r9 ; pt: the OUTPUT buffer of this decrypt routine, despite the name
+%else
+ %xdefine ptr_key2 rcx ; k2
+ %xdefine ptr_key1 rdx ; k1
+ %xdefine T_val r8 ; TW_initial
+ %xdefine N_val r9 ; N
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi ; same register as ptr_key2 - presumably only used once the keys have been read
+ %define target_ptr_val rsi ; same register as ptr_key1 - presumably only used once the keys have been read
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx ; same register as ptr_key2 - presumably only used once the keys have been read
+ %define target_ptr_val rdx ; same register as ptr_key1 - presumably only used once the keys have been read
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; round_key value before a key_expansion_256 macro (flip or flop) is the current round key
+; round_key value after a key_expansion_256 macro (flip or flop) is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1 ; in: aeskeygenassist result (overwritten by the pshufd)
+%define %%xtmp %2 ; scratch; NOTE(review): the shufps sequence appears to rely on zeroed lanes here - confirm at the call site
+%define %%xround_key %3 ; in/out: round key register, updated in place
+ pshufd %%xraw_key, %%xraw_key, 11111111b ; broadcast dword 3 of the aeskeygenassist result to all lanes
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key ; combine with the broadcast assist word
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1 ; in: aeskeygenassist result (overwritten by the pshufd)
+%define %%xtmp %2 ; scratch; NOTE(review): the shufps sequence appears to rely on zeroed lanes here - confirm at the call site
+%define %%xround_key %3 ; in/out: round key register, updated in place
+ pshufd %%xraw_key, %%xraw_key, 10101010b ; broadcast dword 2 of the aeskeygenassist result (flip uses dword 3)
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key ; combine with the broadcast assist word
+%endmacro
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 11
+%define %%xkey2 %1 ; tweak-key round keys (even steps)
+%define %%xkey2_2 %2 ; tweak-key round keys (odd steps)
+%define %%xstate_tweak %3 ; in: tweak value (not loaded by this macro); encrypted in place
+%define %%xkey1 %4 ; data-key round keys (even steps)
+%define %%xkey1_2 %5 ; data-key round keys (odd steps)
+%define %%xraw_key %6 ; aeskeygenassist result
+%define %%xtmp %7 ; scratch for the key_expansion macros
+%define %%xtmp2 %8 ; scratch for aesimc results
+%define %%ptr_key2 %9 ; pointer to the 32-byte tweak key
+%define %%ptr_key1 %10 ; pointer to the 32-byte data key
+%define %%ptr_expanded_keys %11 ; 15 x 16-byte output area for the data-key decryption schedule
+; Encrypts the tweak with key2 (14 rounds) while expanding key1; key1 round key i is stored in slot 14-i,
+; with aesimc applied to round keys 1..13. The encrypted tweak is stored at [TW].
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; round key 0 of key1 goes to the last slot, without aesimc
+
+ movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ aesimc %%xtmp2, %%xkey1_2 ; InvMixColumns form of round key 1 of key1
+ movdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; round key 14 of key1 goes to slot 0, without aesimc
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; initialize: load the first %16 (1..7) input blocks and one tweak per block.
+; Tweak 1 is read from [TW+16*0]. Each further tweak is derived from the
+; previous one, held in twtemph:twtempl: the 128-bit value is shifted left by
+; one bit and, if a bit is carried out of the top, ghash_poly_8b is xored into
+; the low qword. The result is stored to the TW area and reloaded into TWn.
+; Input blocks are read with unaligned loads through ptr_plaintext.
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16 ; number of blocks to load, 1..7
+
+
+ ; the first tweak and the first block are always loaded
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0] ; low qword of tweak 1
+ mov twtemph, [TW+8*1] ; high qword of tweak 1
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ ; tweak 2: previous tweak shifted left one bit, with conditional xor of the polynomial
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; cleared before shl, since xor itself resets the carry flag
+ shl twtempl, 1
+ adc twtemph, twtemph ; doubles the high qword and adds the bit shifted out of the low qword
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a carry out of the high qword selects the polynomial
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ ; tweak 3, same update step as above
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ ; tweak 4
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ ; tweak 5
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ ; tweak 6
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ ; tweak 7
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt_initial: despite its name this macro DEcrypts (aesdec/aesdeclast).
+; It processes the first %17 (1..7) blocks with the round keys stored at
+; [keys+16*0] .. [keys+16*14]; each block is xored with its tweak before and
+; after the cipher rounds.
+; When %18 (lt128) is 0, the next 8 tweak values are generated in between the
+; AES rounds, stored to [TW+16*0] .. [TW+16*7], and the first seven of them are
+; reloaded into TW1..TW7 at the end.
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register (holds the current round key)
+%define %%num_blocks %17
+; %%num_blocks blocks are decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes: when non-zero, no further tweaks are generated
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK: xor in the round key at [keys]
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ ; From here on (only when lt128 is 0) the tweak update is split into pieces
+ ; placed between the AES rounds. The carry flag produced by shl/adc is
+ ; consumed by a later adc/cmovc; the movdqa/aesdec/mov instructions in
+ ; between do not modify the flags, so the statement order must be kept.
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses the carry of the adc issued before round 1
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high qword of Tweak2, stored before it is shifted again
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; first half of the Tweak4 update; its carry is consumed after round 5
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated (stays in memory; no TW8 register here)
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10 (no tweak work is stitched into rounds 10-14)
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 14 (final round, aesdeclast)
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values (same tweaks as before the rounds)
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values (the ones generated between the rounds above)
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; encrypt_by_eight: despite its name this macro DEcrypts 8 blocks in parallel
+; (aesdec/aesdeclast with the round keys at [keys+16*0] .. [keys+16*14]).
+; Each block is xored with its tweak before and after the cipher rounds.
+; When %18 (last_eight) is 0 the next 8 tweak values are generated in between
+; the AES rounds. In every case twtemph:twtempl is written to [TW+16*7] after
+; the final xor, and TW1..TW7 are reloaded from [TW+16*0] .. [TW+16*6].
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (the call sites visible in this file pass the memory operand [TW+16*7])
+%define %%T0 %17 ; Temp register (holds the current round key)
+%define %%last_eight %18 ; when non-zero, no further tweaks are generated
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK: xor in the round key at [keys]
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+ ; The tweak update (shift twtemph:twtempl left one bit, xor ghash_poly_8b
+ ; into the low qword on carry-out) is split into pieces placed between the
+ ; AES rounds. The carry flag set by shl/adc is consumed by a later
+ ; adc/cmovc; the movdqa/aesdec/mov instructions in between do not modify
+ ; the flags, so the statement order must be kept.
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next tweak 1
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next tweak 2
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; carry is consumed by the adc after round 4
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next tweak 3, low qword
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; next tweak 3, high qword (stored before it is shifted again)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; carry is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next tweak 4
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next tweak 5
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next tweak 6
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; carry is consumed by the adc after round 11
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next tweak 7, low qword
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; next tweak 7, high qword (stored before it is shifted again)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; carry is consumed by the cmovc after round 13
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; next tweak 8 now sits in twtemph:twtempl; it is not stored here because the
+; slot [TW + 16*7] may still be needed as TW8 by the final xor below
+%endif
+ ; round 14 (final round, aesdeclast)
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values (same tweaks as before the rounds)
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; deferred store of twtemph:twtempl into the tweak 8 slot; executed
+ ; regardless of last_eight
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_dec_sse:function
+XTS_AES_256_dec_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+ ; on entry xmm8 holds the already processed last full block and N_val the
+ ; number of trailing bytes (N_val mod 16, non-zero on this path)
+
+
+ movdqa xmm2, xmm8 ; keep an unshifted copy for the blend below
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;decrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdec xmm8, [keys + 16*10] ; round 10
+ aesdec xmm8, [keys + 16*11] ; round 11
+ aesdec xmm8, [keys + 16*12] ; round 12
+ aesdec xmm8, [keys + 16*13] ; round 13
+ aesdeclast xmm8, [keys + 16*14] ; round 14 (final round)
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; Common exit for every code path that produced output: xmm8 holds the
+ ; last output block and ptr_ciphertext is biased so that slot 7 is its
+ ; destination.
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+ ; Epilogue: restore the registers saved in the stack frame, release the
+ ; frame and return. Also the direct target for inputs shorter than 16
+ ; bytes (see _less_than_128_bytes), in which case nothing is written.
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ ; win64 only: rdi, rsi and xmm6-xmm15 are restored as well
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ ; Dispatch on the number of whole 16-byte blocks in N_val.
+ ; NOTE(review): the label name implies N_val < 128 here; the branch that
+ ; jumps to this label is outside this view - confirm.
+ cmp N_val, 16
+ jb _ret_ ; fewer than 16 bytes: return without writing any output
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * (whole-block count, bits 4..6 of N_val)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+ ; no match above: execution falls through into _num_blocks_is_7
+
+
+
+
+_num_blocks_is_7:
+ ; Seven whole blocks, optionally followed by a partial block.
+ ; NOTE(review): "initialize" and "encrypt_initial" are macros defined
+ ; earlier in this file, outside this view; presumably they load 7 input
+ ; blocks / tweaks and run the block cipher on xmm1-xmm7 - confirm.
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ ; bias the input pointer by (8-7) blocks so that the fixed offset 112 used
+ ; at _steal_cipher addresses this path's last whole block
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7 ; ZF from the "and": no partial block
+
+_steal_cipher_7:
+ ; derive one more tweak: shift twtemph:twtempl left by one bit and, on
+ ; carry-out, xor ghash_poly_8b into the low qword; store it in TW slot 1
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ ; swap tweaks: park xmm15 (7th tweak argument) in TW slot 0, which
+ ; _steal_cipher reads via [TW], and give the 7th block the new tweak
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store the first 6 output blocks; the 7th continues in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ ; bias the output pointer the same way as the input pointer
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ ; exact multiple of 16 bytes: no tweak swap needed
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store the first 6 output blocks; _done stores the 7th from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ ; bias the output pointer so that _done's store to slot 7 hits block 6
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ ; Six whole blocks, optionally followed by a partial block.
+ ; NOTE(review): "initialize" and "encrypt_initial" are macros defined
+ ; earlier in this file, outside this view; presumably they load 6 input
+ ; blocks / tweaks and run the block cipher on xmm1-xmm6 - confirm.
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ ; bias the input pointer by (8-6) blocks so that the fixed offset 112 used
+ ; at _steal_cipher addresses this path's last whole block
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6 ; ZF from the "and": no partial block
+
+_steal_cipher_6:
+ ; derive one more tweak: shift twtemph:twtempl left by one bit and, on
+ ; carry-out, xor ghash_poly_8b into the low qword; store it in TW slot 1
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ ; swap tweaks: park xmm14 (6th tweak argument) in TW slot 0, which
+ ; _steal_cipher reads via [TW], and give the 6th block the new tweak
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store the first 5 output blocks; the 6th continues in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ ; bias the output pointer the same way as the input pointer
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ ; exact multiple of 16 bytes: no tweak swap needed
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store the first 5 output blocks; _done stores the 6th from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ ; bias the output pointer so that _done's store to slot 7 hits block 5
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ ; Five whole blocks, optionally followed by a partial block.
+ ; NOTE(review): "initialize" and "encrypt_initial" are macros defined
+ ; earlier in this file, outside this view; presumably they load 5 input
+ ; blocks / tweaks and run the block cipher on xmm1-xmm5 - confirm.
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ ; bias the input pointer by (8-5) blocks so that the fixed offset 112 used
+ ; at _steal_cipher addresses this path's last whole block
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5 ; ZF from the "and": no partial block
+
+_steal_cipher_5:
+ ; derive one more tweak: shift twtemph:twtempl left by one bit and, on
+ ; carry-out, xor ghash_poly_8b into the low qword; store it in TW slot 1
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ ; swap tweaks: park xmm13 (5th tweak argument) in TW slot 0, which
+ ; _steal_cipher reads via [TW], and give the 5th block the new tweak
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store the first 4 output blocks; the 5th continues in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ ; bias the output pointer the same way as the input pointer
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ ; exact multiple of 16 bytes: no tweak swap needed
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store the first 4 output blocks; _done stores the 5th from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ ; bias the output pointer so that _done's store to slot 7 hits block 4
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ ; Four whole blocks, optionally followed by a partial block.
+ ; NOTE(review): "initialize" and "encrypt_initial" are macros defined
+ ; earlier in this file, outside this view; presumably they load 4 input
+ ; blocks / tweaks and run the block cipher on xmm1-xmm4 - confirm.
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ ; bias the input pointer by (8-4) blocks so that the fixed offset 112 used
+ ; at _steal_cipher addresses this path's last whole block
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4 ; ZF from the "and": no partial block
+
+_steal_cipher_4:
+ ; derive one more tweak: shift twtemph:twtempl left by one bit and, on
+ ; carry-out, xor ghash_poly_8b into the low qword; store it in TW slot 1
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ ; swap tweaks: park xmm12 (4th tweak argument) in TW slot 0, which
+ ; _steal_cipher reads via [TW], and give the 4th block the new tweak
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 output blocks; the 4th continues in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ ; bias the output pointer the same way as the input pointer
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ ; exact multiple of 16 bytes: no tweak swap needed
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store the first 3 output blocks; _done stores the 4th from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ ; bias the output pointer so that _done's store to slot 7 hits block 3
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ ; Three whole blocks, optionally followed by a partial block.
+ ; NOTE(review): "initialize" and "encrypt_initial" are macros defined
+ ; earlier in this file, outside this view; presumably they load 3 input
+ ; blocks / tweaks and run the block cipher on xmm1-xmm3 - confirm.
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ ; bias the input pointer by (8-3) blocks so that the fixed offset 112 used
+ ; at _steal_cipher addresses this path's last whole block
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3 ; ZF from the "and": no partial block
+
+_steal_cipher_3:
+ ; derive one more tweak: shift twtemph:twtempl left by one bit and, on
+ ; carry-out, xor ghash_poly_8b into the low qword; store it in TW slot 1
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ ; swap tweaks: park xmm11 (3rd tweak argument) in TW slot 0, which
+ ; _steal_cipher reads via [TW], and give the 3rd block the new tweak
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 output blocks; the 3rd continues in xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ ; bias the output pointer the same way as the input pointer
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ ; exact multiple of 16 bytes: no tweak swap needed
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store the first 2 output blocks; _done stores the 3rd from xmm8
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ ; bias the output pointer so that _done's store to slot 7 hits block 2
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ ; Two whole blocks, optionally followed by a partial block.
+ ; NOTE(review): "initialize" and "encrypt_initial" are macros defined
+ ; earlier in this file, outside this view; presumably they load 2 input
+ ; blocks / tweaks and run the block cipher on xmm1-xmm2 - confirm.
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ ; bias the input pointer by (8-2) blocks so that the fixed offset 112 used
+ ; at _steal_cipher addresses this path's last whole block
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2 ; ZF from the "and": no partial block
+
+_steal_cipher_2:
+ ; derive one more tweak: shift twtemph:twtempl left by one bit and, on
+ ; carry-out, xor ghash_poly_8b into the low qword; store it in TW slot 1
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ ; swap tweaks: park xmm10 (2nd tweak argument) in TW slot 0, which
+ ; _steal_cipher reads via [TW], and give the 2nd block the new tweak
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first output block; the 2nd continues in xmm8
+ movdqu [ptr_ciphertext], xmm1
+
+ ; bias the output pointer the same way as the input pointer
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ ; exact multiple of 16 bytes: no tweak swap needed
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store the first output block; _done stores the 2nd from xmm8
+ movdqu [ptr_ciphertext], xmm1
+
+ ; bias the output pointer so that _done's store to slot 7 hits block 1
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ ; One whole block, optionally followed by a partial block.
+ ; NOTE(review): "initialize" and "encrypt_initial" are macros defined
+ ; earlier in this file, outside this view; presumably they load 1 input
+ ; block / tweak and run the block cipher on xmm1 - confirm.
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ ; bias the input pointer by (8-1) blocks so that the fixed offset 112 used
+ ; at _steal_cipher addresses this path's only whole block
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1 ; ZF from the "and": no partial block
+
+_steal_cipher_1:
+ ; derive one more tweak: shift twtemph:twtempl left by one bit and, on
+ ; carry-out, xor ghash_poly_8b into the low qword; store it in TW slot 1
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ ; swap tweaks: park xmm9 (1st tweak argument) in TW slot 0, which
+ ; _steal_cipher reads via [TW], and give the block the new tweak
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: the single block continues in xmm8
+
+ ; bias the output pointer the same way as the input pointer
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ ; exact multiple of 16 bytes: no tweak swap needed
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; nothing is stored here: _done stores the single block from xmm8
+
+ ; bias the output pointer so that _done's store to slot 7 hits block 0
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; 32-byte table of pshufb control bytes. The code loads 16 bytes at an
+; unaligned offset into it (table+N_val, or table+16-N_val) to obtain a
+; byte-shift control; control bytes with bit 7 set make pshufb emit zero.
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+; bit 7 of every byte; xor-ed into a shuffle control to invert which
+; byte lanes are zeroed by pshufb / selected by pblendvb
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm
new file mode 100644
index 00000000..e0c1d493
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm
@@ -0,0 +1,1707 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+; Stack frame layout, as offsets from rsp:
+; 16*0 .. 16*7 TW : 8 tweak slots of 16 bytes
+; 16*8 .. 16*22 keys : 15 expanded round keys of 16 bytes
+; elf64: 16*23 _gpr : 1 qword (rbx)
+; win64: 16*23 .. 16*32 _xmm : xmm6-xmm15
+; 16*33 _gpr : 3 qwords (rdi, rsi, rbx)
+; TW and keys are accessed with aligned moves (vmovdqa), so rsp must be
+; 16-byte aligned once the frame is allocated.
+; NOTE(review): the "odd multiple of 8" rule below presumably compensates for
+; the 8-byte return address pushed by the call - confirm against the prologue.
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+; NOTE(review): presumably the value loaded into ghash_poly_8b for the tweak
+; update; the load itself is outside this view - confirm.
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_avx(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+; Argument order follows the prototype above: k2, k1, TW_initial, N, pt, ct.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ ; NOTE(review): on win64 the 5th and 6th arguments are annotated with stack
+ ; addresses; presumably the prologue (outside this view) loads them into
+ ; r10 / r11 - confirm.
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+; tmp1 and target_ptr_val are the same physical registers as ptr_key2 and
+; ptr_key1 on both ABIs, so using them overwrites the key pointers.
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+; twtempl:twtemph hold the low:high qwords of the running tweak
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; key_expansion_256_flip raw_key, tmp, round_key
+; Derives one AES-256 round key from the output of vaeskeygenassist.
+; %1 raw_key : vaeskeygenassist result; its dword 3 is broadcast to all
+; four lanes (register is clobbered)
+; %2 tmp : scratch register (clobbered). Its dword 0 is selected by
+; both shuffles, so the result depends on its incoming value.
+; NOTE(review): presumably the caller zeroes it - confirm.
+; %3 round_key : on entry an earlier round key of the schedule, on exit the
+; newly derived round key
+; key_expansion_256_flip and key_expansion_256_flop are used alternately
+; (flip-flop fashion) to walk the key schedule.
+; The shuffles use VEX-encoded vshufps (NASM two-operand shorthand: the
+; destination is also the first source) so that no legacy-SSE instruction is
+; mixed into this AVX code path; the low 128 bits are identical to shufps.
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+; key_expansion_256_flop raw_key, tmp, round_key
+; Counterpart of key_expansion_256_flip; the only difference is that dword 2
+; (instead of dword 3) of the vaeskeygenassist result is broadcast.
+; %1 raw_key : vaeskeygenassist result (clobbered)
+; %2 tmp : scratch register (clobbered). Its dword 0 is selected by
+; both shuffles, so the result depends on its incoming value.
+; NOTE(review): presumably the caller zeroes it - confirm.
+; %3 round_key : on entry an earlier round key of the schedule, on exit the
+; newly derived round key
+; The shuffles use VEX-encoded vshufps (NASM two-operand shorthand: the
+; destination is also the first source) so that no legacy-SSE instruction is
+; mixed into this AVX code path; the low 128 bits are identical to shufps.
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+;
+; encrypt_T xkey2, xkey2_2, xstate_tweak, xkey1, xkey1_2, xraw_key, xtmp,
+; ptr_key2, ptr_key1, ptr_expanded_keys
+; %1,%2 xkey2, xkey2_2 : registers alternately holding the two most
+; recent round keys of key2 (tweak key)
+; %3 xstate_tweak : AES state; must already hold the tweak input,
+; the macro only xors/encrypts it
+; %4,%5 xkey1, xkey1_2 : same as %1,%2 for key1 (data key)
+; %6 xraw_key : receives each vaeskeygenassist result
+; %7 xtmp : scratch for the key_expansion_256_* macros
+; %8,%9 ptr_key2, ptr_key1 : 32-byte raw keys, read with unaligned loads
+; %10 ptr_expanded_keys : receives the 15 round keys of key1; written
+; with vmovdqa, so it must be 16-byte aligned
+; key2's round keys are consumed immediately and never stored. The encrypted
+; tweak is written to [TW] (the global TW define, not a macro argument).
+; Each rcon value (0x1 .. 0x40) is used for one flip/flop pair.
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ ; round keys 0 and 1 are the raw 32-byte key itself
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ ; last round key: only a "flip" step is needed, and the final AES round
+ ; uses vaesenclast
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+;
+; initialize ST1..ST8, TW1..TW7, num_initial_blocks
+; %1-%8 ST1..ST8 : state registers; ST1..ST<num> receive input blocks
+; loaded unaligned from ptr_plaintext (ST8 is accepted
+; but not referenced by this macro)
+; %9-%15 TW1..TW7 : tweak registers; TW1..TW<num> are loaded
+; %16 num_initial_blocks : assemble-time constant, 1..7
+; TW1 is read from TW slot 0. Every further tweak is derived from the previous
+; one by shifting twtemph:twtempl left by one bit and, on carry-out, xor-ing
+; ghash_poly_8b into the low qword; it is written to the next TW slot and
+; loaded back from there.
+; NOTE(review): presumably ghash_poly_8b holds GHASH_POLY (0x87); the load is
+; outside this view - confirm.
+; On exit twtempl/twtemph hold the last tweak generated; ghash_poly_8b_temp
+; is clobbered when num_initial_blocks >= 2.
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+;
+; encrypt_initial ST1..ST8, TW1..TW7, T0, num_blocks, lt128
+; %1-%8 ST1..ST8 : state registers; ST1..ST<num_blocks> are encrypted in
+; place (ST8 is accepted but not referenced)
+; %9-%15 TW1..TW7 : tweak for each state; xor-ed in before and after AES
+; %16 T0 : scratch register, loaded with each round key
+; %17 num_blocks : assemble-time constant, 1..7
+; %18 lt128 : assemble-time flag. When 0, the next 8 tweaks are
+; computed between the AES rounds, written to TW slots
+; 0..7, and TW1..TW7 are reloaded from slots 0..6 on
+; exit. Otherwise no tweak work is emitted.
+; Round keys are read from [keys + 16*0 .. 16*14] (AES-256: 14 rounds).
+; When lt128 is 0, the tweak update is split across rounds and relies on the
+; carry flag surviving the interleaved vmovdqa/vaesenc instructions, which do
+; not modify flags; keep the instruction order intact.
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ ; start of next Tweak1: shift; CF is consumed by the cmovc after round 1
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ ; high qword of Tweak2 is stored here; then Tweak3 is started
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ ; CF from this shl is consumed by the adc after round 5
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ ; Tweak8 is only written to TW slot 7; it is not loaded into a register
+ ; by this macro
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; encrypt_by_eight: AES-encrypt eight state registers in parallel with round
+; keys 0..14 read from [keys]; each state is xored with its tweak before round
+; key 0 and again after the last round.
+; When %%last_eight is 0 the scalar code placed between the rounds also derives
+; the next eight tweaks from twtemph:twtempl and writes them to the TW scratch
+; area. That code is deliberately spread across the rounds: a shl/adc in one
+; %if block may be consumed by an adc/cmovc in the next one, so the relative
+; order of the flag-writing scalar instructions must be preserved.
+; On exit %%TW1..%%TW7 are reloaded from [TW + 16*0 .. 16*6].
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reduction term defaults to 0
+ shl twtempl, 1 ; next tweak 1: shift the 128-bit value twtemph:twtempl left by one bit
+ adc twtemph, twtemph ; carry out of the low half enters the high half
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit left the high half: select the polynomial
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp ; fold the reduction term into the low half
+ mov [TW + 8*0], twtempl ; store next tweak 1
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reset the reduction term for next tweak 2
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1 ; next tweak 2
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; store next tweak 2
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 3 begins; the carry is consumed by the adc after round 4
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph ; uses the carry left by the shl before round 4
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; store next tweak 3, low half
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; store next tweak 3, high half
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 4 begins
+ adc twtemph, twtemph ; the carry is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses the carry left by the adc before round 6
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; store next tweak 4
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 5
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; store next tweak 5
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1 ; next tweak 6
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; store next tweak 6
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 7 begins; the carry is consumed by the adc after round 11
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph ; uses the carry left by the shl before round 11
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; store next tweak 7, low half
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; store next tweak 7, high half
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 8 begins
+ adc twtemph, twtemph ; the carry is consumed by the cmovc after round 13
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses the carry left by the adc before round 13
+ xor twtempl, ghash_poly_8b_temp
+; Next tweak 8 is not stored here: the callers in this file pass [TW + 16*7] as %%TW8,
+; so that slot must keep the current tweak 8 until the final tweak xor below has read it.
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl ; store tweak 8 for the next group (unconditional; this macro leaves twtempl/twtemph unchanged when %%last_eight is nonzero)
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+; XTS_AES_256_enc_avx: entry point. This part sets up the stack frame, saves
+; registers, prepares the initial tweak, and dispatches on the number of whole
+; 16-byte blocks in N_val. The argument names used below (T_val, N_val,
+; ptr_key1, ptr_key2, ptr_plaintext, ptr_ciphertext) and the frame symbols
+; (_gpr, _xmm, TW, keys, VARIABLE_OFFSET) are defined outside this excerpt.
+global XTS_AES_256_enc_avx:function
+XTS_AES_256_enc_avx:
+
+ sub rsp, VARIABLE_OFFSET ; reserve the local frame (released at _ret_)
+
+ mov [_gpr + 8*0], rbx ; rbx is restored at _ret_
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi ; win64 builds also save rdi, rsi and xmm6-xmm15; all are restored at _ret_
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys ; macro defined outside this excerpt; presumably encrypts the tweak with key2 and expands key1 into [keys] - confirm against its definition
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; hold back the last 8 whole blocks (128 bytes): _last_eight encrypts them without stitched Tweak calculations
+ jl _less_than_128_bytes ; fewer than 8 whole blocks in total
+
+ add target_ptr_val, ptr_ciphertext ; output address at which the final 8 whole blocks start
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks mod 8)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1 ; no match means 7: fall through to _initial_num_blocks_is_7
+
+; Whole-block count is 7 mod 8 (also reached by fall-through from the dispatch):
+; encrypt 7 leading blocks so that a multiple of 8 blocks remains.
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 ; macro defined outside this excerpt; presumably loads the 7 blocks and prepares their tweaks - confirm
+ add ptr_plaintext, 16*7 ; step past the 7 input blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 ; final 0 = lt128 flag: the macro also produces the tweaks for the next group
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val ; only the final 8 whole blocks left?
+ je _last_eight
+
+ jmp _main_loop
+; Whole-block count is 6 mod 8: encrypt 6 leading blocks so that a multiple of 8 blocks remains.
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 ; macro defined outside this excerpt; presumably loads the 6 blocks and prepares their tweaks - confirm
+ add ptr_plaintext, 16*6 ; step past the 6 input blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 ; final 0 = lt128 flag: the macro also produces the tweaks for the next group
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val ; only the final 8 whole blocks left?
+ je _last_eight
+
+ jmp _main_loop
+; Whole-block count is 5 mod 8: encrypt 5 leading blocks so that a multiple of 8 blocks remains.
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 ; macro defined outside this excerpt; presumably loads the 5 blocks and prepares their tweaks - confirm
+ add ptr_plaintext, 16*5 ; step past the 5 input blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 ; final 0 = lt128 flag: the macro also produces the tweaks for the next group
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val ; only the final 8 whole blocks left?
+ je _last_eight
+
+ jmp _main_loop
+; Whole-block count is 4 mod 8: encrypt 4 leading blocks so that a multiple of 8 blocks remains.
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 ; macro defined outside this excerpt; presumably loads the 4 blocks and prepares their tweaks - confirm
+ add ptr_plaintext, 16*4 ; step past the 4 input blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 ; final 0 = lt128 flag: the macro also produces the tweaks for the next group
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val ; only the final 8 whole blocks left?
+ je _last_eight
+
+ jmp _main_loop
+
+
+; Whole-block count is 3 mod 8: encrypt 3 leading blocks so that a multiple of 8 blocks remains.
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 ; macro defined outside this excerpt; presumably loads the 3 blocks and prepares their tweaks - confirm
+ add ptr_plaintext, 16*3 ; step past the 3 input blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 ; final 0 = lt128 flag: the macro also produces the tweaks for the next group
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val ; only the final 8 whole blocks left?
+ je _last_eight
+
+ jmp _main_loop
+; Whole-block count is 2 mod 8: encrypt 2 leading blocks so that a multiple of 8 blocks remains.
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 ; macro defined outside this excerpt; presumably loads the 2 blocks and prepares their tweaks - confirm
+ add ptr_plaintext, 16*2 ; step past the 2 input blocks
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 ; final 0 = lt128 flag: the macro also produces the tweaks for the next group
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val ; only the final 8 whole blocks left?
+ je _last_eight
+
+ jmp _main_loop
+
+; Whole-block count is 1 mod 8: encrypt 1 leading block so that a multiple of 8 blocks remains.
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 ; macro defined outside this excerpt; presumably loads the block and prepares its tweak - confirm
+ add ptr_plaintext, 16*1 ; step past the input block
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 ; final 0 = lt128 flag: the macro also produces the tweaks for the next group
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val ; only the final 8 whole blocks left?
+ je _last_eight
+
+ jmp _main_loop
+
+; Whole-block count is a multiple of 8: no leading blocks to encrypt.
+; Starting from the tweak already at [TW + 16*0], derive tweaks 2..8 by repeated
+; doubling (shl/adc, then a conditional xor of ghash_poly_8b on carry-out),
+; store them at [TW + 16*1 .. 16*7] and load tweaks 1..7 into xmm9..xmm15.
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0] ; low half of tweak 1
+ mov twtemph, [TW+8*1] ; high half of tweak 1
+ vmovdqa xmm9, [TW+16*0] ; tweak 1
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1] ; tweak 2
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2] ; tweak 3
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3] ; tweak 4
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4] ; tweak 5
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5] ; tweak 6
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6] ; tweak 7
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ; tweak 8 is not loaded into a register: encrypt_by_eight is given [TW+16*7] as its memory operand
+
+ cmp ptr_ciphertext, target_ptr_val ; only the final 8 whole blocks to do?
+ je _last_eight
+; Main loop: encrypt 8 blocks per iteration while generating the tweaks for the
+; following 8 blocks (last argument 0), until ptr_ciphertext reaches target_ptr_val.
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 ; tweak 8 is passed as a memory operand
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop ; on exit, fall through to _last_eight
+
+; Final 8 whole blocks: encrypted without generating further tweaks (last
+; argument 1). ptr_plaintext and ptr_ciphertext are not advanced here; the tail
+; code addresses the 8th block at offset 16*7. Only 7 blocks are stored here -
+; the 8th (xmm8) is written at _done, after ciphertext stealing when a partial
+; block follows.
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done ; no partial block: just store xmm8; otherwise fall through to _steal_cipher
+; Ciphertext stealing for a trailing partial block of N_val (1..15) bytes.
+; On entry xmm8 holds the ciphertext of the last whole block, and ptr_plaintext /
+; ptr_ciphertext are positioned so that offset 112 addresses that block.
+; The first N_val bytes of xmm8 become the final partial ciphertext; the partial
+; plaintext, padded with the remaining bytes of xmm8, is encrypted under the
+; next tweak and left in xmm8 for _done to store at offset 112.
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8 ; keep an unshifted copy of the last whole ciphertext block
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table] ; twtempl is free to reuse as a pointer: the tweak was stored to [TW] above
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 ; the last N_val bytes written are the partial ciphertext; the first 16-N_val bytes are overwritten by the store at _done
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1] ; flip bit 7 of every control byte
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ; where bit 7 of the xmm0 byte is set take the byte from xmm2, else keep xmm3 (xmm0 is an explicit operand in this VEX form)
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenc xmm8, [keys + 16*10] ; round 10
+ vaesenc xmm8, [keys + 16*11] ; round 11
+ vaesenc xmm8, [keys + 16*12] ; round 12
+ vaesenc xmm8, [keys + 16*13] ; round 13
+ vaesenclast xmm8, [keys + 16*14] ; round 14 (last)
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+; Common tail: write xmm8 as the last whole ciphertext block. Every path that
+; reaches here has positioned ptr_ciphertext so that offset 16*7 is that block's slot.
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+; Epilogue: restore the registers saved in the prologue, release the frame and return.
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET ; undo the frame reservation made on entry
+
+ ret
+
+
+
+
+
+; Input has fewer than 8 whole blocks. Fewer than 16 bytes: return without
+; writing any output. Otherwise dispatch on the whole-block count (1..7).
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_ ; not even one whole block
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1 ; no match means 7: fall through to _num_blocks_is_7
+
+; Short input with 7 whole blocks (also reached by fall-through from the dispatch).
+; The first 6 ciphertext blocks are stored here; the 7th is moved to xmm8 and both
+; pointers are moved back by 1 block so that the shared tail code (offset 16*7 at
+; _done, offset 112 in _steal_cipher) addresses block 7.
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 ; macro defined outside this excerpt; presumably loads the 7 blocks and prepares their tweaks - confirm
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 ; final 1 = lt128 flag
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7 ; last whole ciphertext block goes where the tail code expects it
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+; Short input with 6 whole blocks. The first 5 ciphertext blocks are stored here;
+; the 6th is moved to xmm8 and both pointers are moved back by 2 blocks so that the
+; shared tail code (offset 16*7 at _done, offset 112 in _steal_cipher) addresses block 6.
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 ; macro defined outside this excerpt; presumably loads the 6 blocks and prepares their tweaks - confirm
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 ; final 1 = lt128 flag
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6 ; last whole ciphertext block goes where the tail code expects it
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+; Short input with 5 whole blocks. The first 4 ciphertext blocks are stored here;
+; the 5th is moved to xmm8 and both pointers are moved back by 3 blocks so that the
+; shared tail code (offset 16*7 at _done, offset 112 in _steal_cipher) addresses block 5.
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 ; macro defined outside this excerpt; presumably loads the 5 blocks and prepares their tweaks - confirm
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 ; final 1 = lt128 flag
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5 ; last whole ciphertext block goes where the tail code expects it
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+; Short input with 4 whole blocks. The first 3 ciphertext blocks are stored here;
+; the 4th is moved to xmm8 and both pointers are moved back by 4 blocks so that the
+; shared tail code (offset 16*7 at _done, offset 112 in _steal_cipher) addresses block 4.
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 ; macro defined outside this excerpt; presumably loads the 4 blocks and prepares their tweaks - confirm
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 ; final 1 = lt128 flag
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4 ; last whole ciphertext block goes where the tail code expects it
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+; Short input with 3 whole blocks. The first 2 ciphertext blocks are stored here;
+; the 3rd is moved to xmm8 and both pointers are moved back by 5 blocks so that the
+; shared tail code (offset 16*7 at _done, offset 112 in _steal_cipher) addresses block 3.
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 ; macro defined outside this excerpt; presumably loads the 3 blocks and prepares their tweaks - confirm
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 ; final 1 = lt128 flag
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3 ; last whole ciphertext block goes where the tail code expects it
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+; Short input with 2 whole blocks. The first ciphertext block is stored here;
+; the 2nd is moved to xmm8 and both pointers are moved back by 6 blocks so that the
+; shared tail code (offset 16*7 at _done, offset 112 in _steal_cipher) addresses block 2.
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 ; macro defined outside this excerpt; presumably loads the 2 blocks and prepares their tweaks - confirm
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 ; final 1 = lt128 flag
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2 ; last whole ciphertext block goes where the tail code expects it
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+; Short input with 1 whole block. Nothing is stored here; the block's ciphertext
+; is moved to xmm8 and both pointers are moved back by 7 blocks so that the shared
+; tail code (offset 16*7 at _done, offset 112 in _steal_cipher) addresses block 1.
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 ; macro defined outside this excerpt; presumably loads the block and prepares its tweak - confirm
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 ; final 1 = lt128 flag
+ ; no earlier blocks to store; the single block is written at _done
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1 ; the only whole ciphertext block goes where the tail code expects it
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+; vpshufb_shf_table: 32 bytes of byte-shuffle controls, read at a variable byte
+; offset. _steal_cipher loads 16 bytes at offset N_val for the left shift and at
+; offset 16-N_val (then xored with mask1) for the right shift.
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+; mask1: 0x80 in every byte. _steal_cipher xors it into a shuffle control taken
+; from vpshufb_shf_table, which flips bit 7 of each control byte.
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm
new file mode 100644
index 00000000..e4ace946
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm
@@ -0,0 +1,1652 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; key1 (the data/"ECB" key) round keys are copied to the stack, aligned to 16 Bytes
+; key2 (the tweak key) is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+; Stack frame layout, as offsets from rsp:
+;   rsp + 16*0  : TW   - 8 tweak slots of 16 bytes each
+;   rsp + 16*8  : keys - 15 round keys of 16 bytes each
+;   elf64     : rsp + 16*23 : _gpr - 1 qword
+;   otherwise : rsp + 16*23 : _xmm - 10 slots of 16 bytes (defined for win64)
+;               rsp + 16*33 : _gpr - 3 qwords
+; VARIABLE_OFFSET is the total size of these areas.
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+; NOTE(review): the "odd multiple of 8" requirement is presumably what makes
+; rsp 16-byte aligned once VARIABLE_OFFSET is subtracted from the entry rsp
+; (which is 8 mod 16 after the caller's call); TW and keys are accessed
+; with aligned vmovdqa - confirm against the ABI in use
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+; constant XORed into the low qword of a tweak when doubling it carries out
+; of bit 127 (see the shl/adc/cmovc/xor sequences in the macros below)
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+; elf64: all six arguments arrive in registers
+; otherwise: the first four arrive in registers; ptr_plaintext and
+; ptr_ciphertext are aliases for r10/r11, to be loaded from the stack slots
+; named in the trailing comments
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+; NOTE: in both branches tmp1 is the same register as ptr_key2 and
+; target_ptr_val is the same register as ptr_key1, so these temporaries may
+; only be written once the key pointers are no longer needed
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+; twtempl:twtemph hold the low:high qwords of the most recently generated
+; tweak and are carried from one macro invocation to the next
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; encrypt_T: AES-256 encrypt the initial tweak with the key2 round keys and,
+; interleaved with that, copy the 15 key1 round keys to an aligned buffer.
+; Arguments:
+;   %1  xmm scratch for the current key2 round key
+;   %2  xmm holding the tweak (in: initial value, out: encrypted tweak)
+;   %3  xmm scratch for the key1 round key being copied
+;   %4  accepted but not used
+;   %5  accepted but not used
+;   %6  pointer to the expanded key2 (15 round keys, read unaligned)
+;   %7  pointer to the expanded key1 (15 round keys, read unaligned)
+;   %8  destination of the key1 copy (written with aligned stores)
+; The encrypted tweak is also stored to [TW].
+
+%macro encrypt_T 8
+%define %%tweak_rk %1
+%define %%tweak %2
+%define %%data_rk %3
+%define %%k2 %6
+%define %%k1 %7
+%define %%stack_keys %8
+
+ ; round 0: ARK for tweak encryption, copy key1 round key 0
+ vmovdqu %%tweak_rk, [%%k2 + 16*0]
+ vpxor %%tweak, %%tweak_rk
+ vmovdqu %%data_rk, [%%k1 + 16*0]
+ vmovdqa [%%stack_keys + 16*0], %%data_rk
+
+ ; rounds 1..13: one vaesenc on the tweak, one key1 round key copied
+%assign %%rnd 1
+%rep 13
+ vmovdqu %%tweak_rk, [%%k2 + 16*%%rnd]
+ vaesenc %%tweak, %%tweak_rk
+ vmovdqu %%data_rk, [%%k1 + 16*%%rnd]
+ vmovdqa [%%stack_keys + 16*%%rnd], %%data_rk
+%assign %%rnd (%%rnd + 1)
+%endrep
+
+ ; round 14: last round for the tweak, copy the last key1 round key
+ vmovdqu %%tweak_rk, [%%k2 + 16*14]
+ vaesenclast %%tweak, %%tweak_rk
+ vmovdqu %%data_rk, [%%k1 + 16*14]
+ vmovdqa [%%stack_keys + 16*14], %%data_rk
+
+ vmovdqa [TW], %%tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+;
+; initialize: prepare the first %%num_initial_blocks (1..7) blocks.
+; Arguments:
+;   %1-%7   ST1..ST7  receive plaintext blocks 0..6 (unaligned loads from
+;                     ptr_plaintext); only the first %16 are loaded
+;   %8      ST8       accepted but not used by this macro
+;   %9-%15  TW1..TW7  receive the tweaks for those blocks
+;   %16     number of blocks, assemble-time constant
+; On entry [TW + 16*0] must hold the tweak for block 0 and ghash_poly_8b the
+; reduction constant. Tweak k+1 is derived from tweak k by a 128-bit left
+; shift by one with a conditional XOR of ghash_poly_8b into the low qword,
+; and is written to [TW + 16*k] before being loaded into its register.
+; On exit twtempl:twtemph hold the last tweak generated.
+; ghash_poly_8b_temp is clobbered when %16 >= 2.
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; temp = 0
+ shl twtempl, 1 ; low qword << 1, CF = old bit 63
+ adc twtemph, twtemph ; high qword << 1 with carry in, CF = old bit 127
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; temp = constant only if bit 127 was set
+ xor twtempl, ghash_poly_8b_temp ; fold the carry back into the low qword
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+;
+; Each state is XORed with its tweak, run through 14 AES rounds using the
+; round keys at [keys + 16*0 .. 16*14], and XORed with the same tweak again.
+; Arguments:
+;   %1-%7   ST1..ST7   block states (in: plaintext, out: ciphertext); only
+;                      the first %17 are touched
+;   %8      ST8        accepted but not used by this macro
+;   %9-%15  TW1..TW7   tweaks for the blocks
+;   %16     T0         temp register holding the current round key
+;   %17     num_blocks number of blocks (1..7), assemble-time constant
+;   %18     lt128      0: additionally generate the next 8 tweaks into
+;                         [TW + 16*0 .. 16*7], continuing from
+;                         twtempl:twtemph, and reload TW1..TW7 from the
+;                         first seven of them (the eighth is left in memory);
+;                      non-zero: no tweak generation, TW1..TW7 are only read
+; The scalar tweak arithmetic is spread between the AES rounds. The carry
+; flag produced by a shl/adc in one fragment is consumed by the adc/cmovc of
+; a later fragment, so the relative order of the scalar instructions must be
+; preserved; the vector instructions in between do not write the flags.
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF is consumed by the cmovc after round 1
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high qword of Tweak2
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; CF is consumed by the adc after round 5
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ ; all seven registers are reloaded regardless of %%num_blocks; the eighth
+ ; new tweak stays in [TW + 16*7] and in twtempl:twtemph
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+;
+; Each state is XORed with its tweak, run through 14 AES rounds using the
+; round keys at [keys + 16*0 .. 16*14], and XORed with the same tweak again.
+; Arguments:
+;   %1-%8   ST1..ST8   block states (in: plaintext, out: ciphertext)
+;   %9-%16  TW1..TW8   tweaks for the eight blocks
+;   %17     T0         temp register holding the current round key
+;   %18     last_eight 0: generate the next 8 tweaks, continuing from
+;                         twtempl:twtemph, interleaved with the AES rounds;
+;                      non-zero: skip the tweak generation
+; In both cases the macro ends by storing twtempl:twtemph to [TW + 16*7] and
+; reloading TW1..TW7 from [TW + 16*0 .. 16*6]; TW8 is not reloaded here.
+; The carry flag produced by a shl/adc in one scalar fragment is consumed by
+; the adc/cmovc of a later fragment, so the relative order of the scalar
+; instructions must be preserved; the vector instructions in between do not
+; write the flags.
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ ; next tweak 1: shift and select the reduction value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; next tweak 1: apply the reduction and store
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; next tweak 2
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 3 starts; CF is consumed by the adc after round 4
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ ; next tweak 4
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF is consumed by the cmovc after round 6
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; next tweak 5
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; next tweak 6
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; next tweak 7 starts; CF is consumed by the adc after round 11
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ ; next tweak 8
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF is consumed by the cmovc after round 13
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ ; next tweak 8 is written to [TW + 8*14] / [TW + 8*15] after the final round, below
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; executed regardless of %%last_eight; when %%last_eight is non-zero
+ ; twtempl:twtemph were not advanced by this invocation
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ ; (also unconditional; TW8 is not reloaded by this macro)
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_enc_expanded_key_avx:function
+XTS_AES_256_enc_expanded_key_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8 ; keep an unshifted copy of xmm8 for the blend below
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is the blend mask (an explicit operand in this VEX form)
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenc xmm8, [keys + 16*10] ; round 10
+ vaesenc xmm8, [keys + 16*11] ; round 11
+ vaesenc xmm8, [keys + 16*12] ; round 12
+ vaesenc xmm8, [keys + 16*13] ; round 13
+ vaesenclast xmm8, [keys + 16*14] ; round 14 (final round)
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm
new file mode 100644
index 00000000..47fe6528
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm
@@ -0,0 +1,1651 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; k1 (second argument, the "ECB" encryption key) is copied to the stack, aligned to 16 Bytes
+; k2 (first argument, the tweak key) is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values (16 bytes each, starting at rsp)
+%define keys rsp + 16*8 ; store 15 expanded keys (16 bytes each, right after the 8 tweak slots)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15 (placed after the 8 tweak + 15 key slots)
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx (placed after the 8 tweak + 15 key slots)
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; tweaks + keys + rbx; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx (placed after the 10 xmm slots that start at rsp + 16*23)
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; tweaks + keys + 10 xmm slots + rdi/rsi/rbx; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; registers that hold the six input parameters, listed in prototype order (k2, k1, TW_initial, N, pt, ct)
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] NOTE(review): presumably the stack slot r10 is loaded from by code outside this view - confirm
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] NOTE(review): presumably the stack slot r11 is loaded from by code outside this view - confirm
+%endif
+
+; scratch registers; in both branches tmp1 and target_ptr_val share the registers of ptr_key2 and ptr_key1
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi ; same register as ptr_key2
+ %define target_ptr_val rsi ; same register as ptr_key1
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx ; same register as ptr_key2
+ %define target_ptr_val rdx ; same register as ptr_key1
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation: low 64 bits of the running tweak
+%define twtemph rbx ; high 64 bits of the running tweak
+
+
+; macro to encrypt the tweak value with the key2 round keys while copying the 15 key1 round keys to the stack
+
+%macro encrypt_T 8
+%define %%xkey2 %1 ; scratch xmm: current key2 round key
+%define %%xstate_tweak %2 ; xmm holding the tweak, encrypted in place
+%define %%xkey1 %3 ; scratch xmm: current key1 round key being copied
+%define %%xraw_key %4 ; not referenced in this macro
+%define %%xtmp %5 ; not referenced in this macro
+%define %%ptr_key2 %6 ; address of the expanded key2 round keys (read unaligned)
+%define %%ptr_key1 %7 ; address of the expanded key1 round keys (read unaligned)
+%define %%ptr_expanded_keys %8 ; destination of the key1 copy; written with movdqa, so it must be 16-byte aligned
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*11]
+ aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*11]
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*12]
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*12]
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*13]
+ aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*13]
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*14]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*14]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value (uses the file-level TW define, not a macro argument)
+%endmacro
+
+
+; generate the tweak values for the first %16 (1..7) blocks, starting from the tweak stored at [TW]
+; load the matching plaintext blocks from ptr_plaintext into the state registers
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph; tweak 2 is now complete in [TW+16*1]
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph; tweak 3 is now complete in [TW+16*2]
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph; tweak 4 is now complete in [TW+16*3]
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph; tweak 5 is now complete in [TW+16*4]
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph; tweak 6 is now complete in [TW+16*5]
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph; tweak 7 is now complete in [TW+16*6]
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES (14 rounds, round keys read from [keys])
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; when %18 is 0, the next 8 Tweak values are generated between the rounds and written to [TW]
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes: when 0, the next tweaks are generated; otherwise that code is omitted
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; the carry out is consumed by the cmovc placed after round 1
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high half of Tweak2, stored before twtemph is advanced again
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; the carry out is consumed by the adc placed after round 5
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph ; Tweak8 is only written to [TW + 16*7]; this macro does not load it into a register
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 14 (final round, aesenclast)
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values 1-7 into the tweak registers
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+;
+; Operands:
+;   %1-%8    state registers: plaintext on entry, ciphertext on exit
+;   %9-%16   tweak operands for blocks 1-8 (the callers in this file pass
+;            the memory operand [TW+16*7] as the 8th tweak)
+;   %17      scratch xmm register that receives each round key
+;   %18      last_eight flag: when 0, the next 8 tweak values are computed
+;            in general-purpose registers between the AES rounds
+;
+; Each state is XORed with its tweak, run through round keys 0-14 read from
+; [keys + 16*i], and XORed with the same tweak again.
+;
+; Tweak generation (last_eight == 0): twtemph:twtempl is shifted left by one
+; bit with shl/adc, and ghash_poly_8b is XORed into the low half when a bit
+; is carried out. The result is written to the next 16-byte slot of TW.
+; Several of these shl/adc/cmovc sequences are split across an AES round, so
+; the carry flag must stay untouched by the movdqa/aesenc instructions that
+; sit in between.
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ ; next tweak 1: double twtemph:twtempl, select the reduction constant on carry
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; next tweak 1: apply the reduction and store to slot 0
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; next tweak 2
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; store tweak 2, start tweak 3 (the carry from shl is consumed after round 4)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; finish tweak 3, store its low half
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; store high half of tweak 3, start tweak 4 (carry consumed after round 6)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; finish and store tweak 4
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; next tweak 5
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; finish and store tweak 5
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; next tweak 6
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; store tweak 6, start tweak 7 (carry consumed after round 11)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; finish tweak 7, store its low half
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; store high half of tweak 7, start tweak 8 (carry consumed after round 13)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ ; finish tweak 8; its store is deferred until after the final tweak XOR
+ ; below, because slot 7 of TW may still be in use as %%TW8
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; slot 7 of TW is written unconditionally; when last_eight is 1 the
+ ; twtemp registers were not modified by this macro
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+; Entry point: reserves VARIABLE_OFFSET bytes of stack, saves rbx (plus rdi,
+; rsi and xmm6-xmm15 on win64), encrypts the initial tweak with encrypt_T and
+; then dispatches on the number of full 16-byte blocks in N_val.
+; Inputs shorter than 128 bytes go to _less_than_128_bytes; otherwise the
+; (block count mod 8) leading blocks are handled by one of the
+; _initial_num_blocks_is_* paths before the 8-block loop.
+global XTS_AES_256_enc_expanded_key_sse:function
+XTS_AES_256_enc_expanded_key_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are handled by _last_eight without stitched Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext ; ciphertext address at which the final 8 full blocks start
+
+
+ ; tmp1 = bits 4-6 of N_val, i.e. 16 * (number of full blocks mod 8)
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+ ; no match above: fall through to the 7-block case
+; Leading-block paths for inputs of at least 128 bytes. Each path runs the
+; initialize and encrypt_initial macros (defined earlier in this file) for
+; its block count with the final macro argument 0, stores that many
+; ciphertext blocks, advances both data pointers, and then either jumps to
+; _last_eight (exactly 8 full blocks remain) or enters _main_loop.
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+; Block count is a multiple of 8: no leading blocks to encrypt.
+; Reads the first tweak from slot 0 of TW, derives tweaks 2-8 by repeated
+; doubling (shl/adc, XOR of ghash_poly_8b into the low half on carry) and
+; stores them to slots 1-7. Tweaks 1-7 are loaded into xmm9-xmm15; the 8th
+; stays in memory at [TW+16*7], which the code below passes to
+; encrypt_by_eight as a memory operand.
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ ; exactly 8 full blocks in total: skip the loop, otherwise fall through
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+; Steady-state loop: encrypts 128 bytes (8 blocks) per iteration and, with
+; the last macro argument 0, has encrypt_by_eight compute the following 8
+; tweaks. Runs until ptr_ciphertext reaches target_ptr_val, leaving the
+; final 8 full blocks for _last_eight.
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+; Final 8 full blocks. encrypt_by_eight is invoked with the last argument 1,
+; so no further tweaks are generated inside it. Neither data pointer is
+; advanced here. Only 7 ciphertext blocks are stored: the 8th stays in xmm8
+; and is written at _done, after optional ciphertext stealing.
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+; Ciphertext stealing for a trailing partial block of N_val (1-15) bytes.
+; On entry xmm8 holds the ciphertext of the last full block, and the data
+; pointers are positioned so that this block lives at offset 112 (16*7).
+; On exit xmm8 holds the re-encrypted final block, which _done stores.
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8 ; keep an unshifted copy of the last full ciphertext block
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ ; (twtempl is reused as a table pointer; the tweak is already stored in [TW])
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ ; the 16-byte load and store below both start N_val bytes into the last full block
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1] ; flip the high bit of every shuffle-control byte
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenc xmm8, [keys + 16*10] ; round 10
+ aesenc xmm8, [keys + 16*11] ; round 11
+ aesenc xmm8, [keys + 16*12] ; round 12
+ aesenc xmm8, [keys + 16*13] ; round 13
+ aesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+; Common exit. _done writes xmm8 to the slot at offset 16*7 from
+; ptr_ciphertext. _ret_ restores rbx (plus rdi, rsi and xmm6-xmm15 on win64)
+; from the save area, releases the stack frame and returns.
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+; Short-input path, reached when (N_val rounded down to 16) - 128 is negative.
+; Inputs under 16 bytes return without writing any output. Otherwise
+; dispatch on the number of full blocks; with no match the code falls
+; through to the 7-block case.
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4) ; 16 * number of full blocks
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+; Short-input paths for k = 7..1 full blocks. Each path runs initialize and
+; encrypt_initial (final macro argument 1) for k blocks, stores the first
+; k-1 ciphertext blocks, and copies the k-th into xmm8. Both data pointers
+; are moved back by (8-k)*16 bytes so that the fixed offsets used by the
+; shared _steal_cipher and _done code (112 and 16*7) address block k.
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+ ; (nothing to store here: the only block is written at _done)
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm
new file mode 100644
index 00000000..244c33b3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm
@@ -0,0 +1,1707 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_sse(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; round_key value before this key_expansion_256_flip macro is the round key two steps back
+; round_key value after this key_expansion_256_flip macro is next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+;
+; Operands: %1 = aeskeygenassist result (overwritten), %2 = scratch xmm,
+; %3 = round key updated in place.
+; "flip" broadcasts dword 3 of the aeskeygenassist result (pshufd 11111111b).
+; The two shufps/pxor pairs turn the four dwords of the round key into their
+; running XOR. This only works if dword 0 of %2 is zero on entry; the macro
+; leaves that dword unchanged.
+; NOTE(review): confirm that every caller zeroes the scratch register first.
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+; Counterpart of key_expansion_256_flip for the alternate round keys.
+; Identical instruction sequence except that pshufd broadcasts dword 2 of
+; the aeskeygenassist result (10101010b) instead of dword 3.
+; Operands: %1 = aeskeygenassist result (overwritten), %2 = scratch xmm whose
+; dword 0 must be zero on entry, %3 = round key updated in place.
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 10101010b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+;
+; Operands:
+;   %1, %2   xmm registers holding the two most recent round keys of key2
+;   %3       xmm register with the initial tweak on entry (the macro does not
+;            load it); it is encrypted in place with key2
+;   %4, %5   xmm registers holding the two most recent round keys of key1
+;   %6       xmm register receiving each aeskeygenassist result
+;   %7       scratch xmm for key_expansion_256_flip/flop
+;            NOTE(review): those macros need dword 0 of this register to be
+;            zero -- confirm at the call site
+;   %8, %9   pointers to the raw 32-byte key2 and key1 (read with movdqu)
+;   %10      base address that receives the 15 expanded round keys of key1
+;            (written with movdqa, so it must be 16-byte aligned)
+;
+; Only key1's schedule is stored; key2's round keys are consumed by the tweak
+; encryption as they are produced. The encrypted tweak is written to [TW].
+; Even-numbered round keys are derived with the "flip" macro and odd-numbered
+; ones with "flop", alternating between the two key registers of each key.
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ ; the first two round keys of each schedule are the raw key halves
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; initialize: load the first %16 plaintext blocks and build the matching %16 tweak values
+; Arguments: %1-%8 = state registers ST1-ST8, %9-%15 = tweak registers TW1-TW7, %16 = number of initial blocks (tested against 2..7 below)
+%macro initialize 16
+ ; Tweak k is written to the 16-byte slot [TW+16*(k-1)] and then loaded into TWk; slot 0 must already hold tweak 1 on entry
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16 ; selects which of the %if sections below are assembled
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0] ; tweak 1 is taken as-is from slot 0
+ mov twtempl, [TW+8*0] ; low 64 bits of the running tweak
+ mov twtemph, [TW+8*1] ; high 64 bits of the running tweak
+ movdqu %%ST1, [ptr_plaintext+16*0] ; plaintext block 1 (unaligned load)
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reduction value defaults to 0
+ shl twtempl, 1 ; shift low half left by one; its top bit goes to CF
+ adc twtemph, twtemph ; high = 2*high + CF; CF now holds the bit shifted out of the 128-bit value
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; a bit was shifted out: select the reduction constant
+ xor twtempl, ghash_poly_8b_temp ; fold the constant (or 0) into the low half
+ mov [TW+8*2], twtempl ; tweak 2, low half
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1] ; reload the stored tweak 2 as one 128-bit value
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 3: same shift-and-reduce step applied to tweak 2
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 4
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 5
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 6
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; tweak 7
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+ ; on exit twtempl/twtemph still hold the last tweak generated here (tweak number %16)
+
+%endmacro
+
+
+; encrypt_initial: encrypt the first %17 blocks (1..7) with the 15 round keys stored at [keys] (ARK + 13 aesenc rounds + aesenclast)
+; each state is xored with its tweak before and after the cipher; %1-%8 = states, %9-%15 = tweaks, %16 = temp xmm, %17 = block count, %18 = lt128 flag
+; when %18 is 0 the next 8 Tweak values are generated into TW slots 0-7, interleaved with the AES rounds, and TW1-TW7 are reloaded from slots 0-6
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8 (not referenced in this macro)
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys] ; round key 0, shared by all active states
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+ ; tweak generation below works on twtempl/twtemph as left by the caller and is split across rounds; the order of the flag-producing instructions matters
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reduction value defaults to 0
+ shl twtempl, 1 ; shift low half left by one; top bit goes to CF
+ adc twtemph, twtemph ; CF = bit shifted out of the 128-bit tweak; consumed after round 1
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; uses CF from the adc before round 1 (movdqa/aesenc do not change flags)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reset reduction value for the next tweak
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph ; high half of next Tweak2 (low half was stored before round 3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; reduction value chosen now, applied after round 4
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; CF from this shift is consumed by the adc after round 5
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph ; completes the 128-bit shift started before round 5
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph ; tweak 8 stays in memory only; it is not loaded into a register by this macro
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14] ; last round key, used with aesenclast
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1 ; TW registers still hold the tweaks used before the cipher; the new ones are only in memory so far
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+ ; slot 7 is not loaded: the encrypt_by_eight call sites in this file pass [TW+16*7] as the eighth tweak operand
+%endif
+
+%endmacro
+
+
+; encrypt_by_eight: encrypt 8 blocks in parallel with the 15 round keys at [keys]; %1-%8 = states, %9-%16 = tweaks, %17 = temp xmm, %18 = last_eight flag
+; when %18 is 0 the next 8 tweak values are generated from twtempl/twtemph between the AES rounds and written to TW slots 0-7
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8 (the call sites in this file pass the memory operand [TW+16*7])
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18 ; 0 = also generate the next 8 tweaks; otherwise tweak generation is not assembled
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys] ; round key 0
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp ; reduction value defaults to 0
+ shl twtempl, 1 ; shift low half left by one; top bit goes to CF
+ adc twtemph, twtemph ; CF = bit shifted out of the 128-bit tweak
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; select the reduction constant if a bit was shifted out; applied after round 1
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next tweak 1 -> slot 0
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp ; next tweak 2 now in twtempl/twtemph; stored after round 3
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl ; next tweak 2 -> slot 1
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; CF from this shift is consumed by the adc after round 4
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph ; completes the 128-bit shift started before round 4 (movdqa/aesenc do not change flags)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next tweak 3, low half -> slot 2
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph ; next tweak 3, high half
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF consumed by the cmovc after round 6
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next tweak 4 -> slot 3
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b ; applied after round 8
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next tweak 5 -> slot 4
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp ; next tweak 6 now in twtempl/twtemph; stored after round 10
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl ; next tweak 6 -> slot 5
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1 ; CF from this shift is consumed by the adc after round 11
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next tweak 7, low half -> slot 6
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph ; next tweak 7, high half
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph ; CF consumed by the cmovc after round 13
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; next tweak 8 is NOT stored to [TW + 8*14] / [TW + 8*15] here: slot 7 may still be read
+; as %%TW8 by the final tweak xor below, so the store is deferred until after that xor
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14] ; last round key, used with aesenclast
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8 ; last use of the current tweak 8
+ ; the store and reloads below are outside any %if, so they are assembled for both values of %%last_eight
+ mov [TW + 8*14], twtempl ; twtempl/twtemph -> slot 7 (the next tweak 8 when tweaks were generated above)
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_enc_sse:function
+XTS_AES_256_enc_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 8 blocks (128 bytes) will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenc xmm8, [keys + 16*10] ; round 10
+ aesenc xmm8, [keys + 16*11] ; round 11
+ aesenc xmm8, [keys + 16*12] ; round 12
+ aesenc xmm8, [keys + 16*13] ; round 13
+ aesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm
new file mode 100644
index 00000000..2fbc3b2a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm
@@ -0,0 +1,427 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; the following defines control the operation of the macros below and
+; must be defined in the including file
+; KEY_ROUNDS - number of key rounds needed based on key length: 128bit - 11, 192bit - 13 or 256bit - 15
+; EARLY_BLOCKS - number of data blocks to load before starting computations
+; PARALLEL_BLOCKS - number of blocks of data to process in parallel; also the number of xmm regs to reserve for data
+; IV_CNT - number of xmm regs to use for IV data; valid values are 0 or 1
+; TMP_CNT - number of tmp xmm registers to reserve
+; XMM_USAGE - number of xmm registers to use; must be at least PARALLEL_BLOCKS + 2
+;
+
+%include "reg_sizes.asm"
+
+;
+; the following instructions set specific macros must be defined in the user file
+; to make use of the AES macros below
+; MOVDQ - move from memory to xmm reg
+; PXOR - XOR of two xmm registers pxor
+; AES_DEC - AES block decode for early key rounds
+; AES_DEC_LAST - AES block decode for last key round
+; or
+; AES_ENC - AES block encode for early key rounds
+; AES_ENC_LAST - AES block encode for last key round
+
+; Usages of xmm regs: block data, the IV holder, temporaries and a round key cache
+; CKEY_CNT is (number of xmm regs) - PARALLEL_BLOCKS - IV holder - TMP_CNT temp xmm regs
+; Register indices are laid out in this order:
+; FIRST_XDATA .. IV_IDX-1 : data blocks (PARALLEL_BLOCKS regs)
+; IV_IDX .. TMP-1 : IV / chaining value (IV_CNT regs)
+; TMP .. FIRST_CKEY-1 : temporaries (TMP_CNT regs)
+; FIRST_CKEY .. : cached round keys (CKEY_CNT regs)
+%assign FIRST_XDATA (0)
+%assign IV_IDX (FIRST_XDATA + PARALLEL_BLOCKS)
+; IV_CNT falls back to 1 when the including file did not define it
+%ifndef IV_CNT
+%define IV_CNT (1)
+%endif
+%assign TMP (IV_IDX + IV_CNT)
+; TMP_CNT is assigned here: two temp registers
+%assign TMP_CNT (2)
+%assign FIRST_CKEY (TMP + TMP_CNT)
+%assign CKEY_CNT (XMM_USAGE - (PARALLEL_BLOCKS + IV_CNT + TMP_CNT))
+
+; Abstract xmm register usages that identify the expected contents of the register
+; (each of the four expands to xmm<i>)
+%define reg(i) xmm %+ i
+%define XDATA(i) xmm %+ i
+%define KEY_REG(i) xmm %+ i
+%define IV_REG(i) xmm %+ i
+
+; general purpose register used as the offset into the data buffers
+%define IDX rax
+
+
+
+
+;
+;
+; AES CBC ENCODE MACROS
+;
+;
+
+;
+; CBC_DECRYPT_BLOCKS
+; Decrypts %%num_blocks blocks using the AES_PARALLEL_ENC_BLOCKS macro (driven
+; with the decrypt instructions passed in), then finalizes the decryption:
+; - XORs each decrypted block with the previous block's cipher text, which is
+; held in XDATA(IV_IDX), and saves the plain text in the output
+; - places the last block's cipher text in XDATA(IV_IDX) for the next call
+; - advances %%IDX and reduces %%LEN by %%num_blocks*16; the flags produced by
+; the final 'sub' are left for the code that follows the macro
+;
+%macro CBC_DECRYPT_BLOCKS 17
+%define %%TOT_ROUNDS %1
+%define %%num_blocks %2 ; can be 0..13
+%define %%EARLY_LOADS %3 ; number of data blocks to load before processing
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%CACHED_KEYS %8 ; number of key data cached in xmm regs
+%define %%TMP %9
+%define %%TMP_CNT %10
+%define %%FIRST_CKEY %11
+%define %%KEY_DATA %12
+%define %%FIRST_XDATA %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+%define %%LEN %17
+
+ AES_PARALLEL_ENC_BLOCKS %%TOT_ROUNDS, %%num_blocks, %%EARLY_LOADS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST, %%CACHED_KEYS, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%KEY_DATA, %%FIRST_XDATA, %%IN, %%OUT, %%IDX
+
+ ;
+ ; XOR the result of each block's decrypt with the previous block's cypher text (C)
+ ;
+ %assign i 0
+ %rep (%%num_blocks)
+ %%PXOR XDATA(i), XDATA(IV_IDX) ; XOR result with previous block's C
+ ; the chaining value is reloaded through the %%IDX parameter and before the
+ ; store, so an output buffer equal to the input cannot turn it into plain text
+ %%MOVDQ XDATA(IV_IDX), [%%IN + %%IDX + i*16] ; load IV with current block C
+ %%MOVDQ [%%OUT + %%IDX + i*16], XDATA(i) ; save plain text to out
+ %assign i (i+1)
+ %endrep
+
+ add %%IDX, %%num_blocks*16
+ sub %%LEN, %%num_blocks*16
+%endmacro
+
+
+;
+; CBC_ENC_INIT
+; Fetches the IV into reg(IV index) and the first input block into
+; XDATA(first block index), then XORs the IV into that block.
+%macro CBC_ENC_INIT 7
+%define %%FIRST_BLK %1 ; index of the xmm reg receiving the first data block
+%define %%IV_SLOT %2 ; index of the xmm reg receiving the IV
+%define %%LOAD %3 ; load instruction
+%define %%XOR %4 ; xor instruction
+%define %%IV_PTR %5 ; address of the IV
+%define %%SRC %6 ; input data
+%define %%OFFSET %7 ; index into input and output data buffers
+
+ %%LOAD reg(%%IV_SLOT), [%%IV_PTR]
+ %%LOAD XDATA(%%FIRST_BLK), [%%SRC + %%OFFSET + 0*16]
+ %%XOR XDATA(%%FIRST_BLK), reg(%%IV_SLOT)
+%endmacro
+
+;
+; CBC_ENC_SUBLOOP
+; Emits %%BLOCKS unrolled CBC encrypt steps, one 16 byte block per step.
+;
+; assumptions:
+; LEN is length of data remaining
+; IDX is offset into the data buffer
+; XDATA(%%START_DATA) holds the block to encrypt first, already XORed with
+; its chaining value (presumably prepared by CBC_ENC_INIT - confirm against callers)
+;
+; each step
+; unless exactly 16 bytes remain, load next block into a next XDATA reg (XDATA(p_next))
+; AES block encrypt XDATA(p_first) (uncached keys are loaded by AES_ENC_BLOCKS)
+; unless exactly 16 bytes remain, XOR next block (XDATA(p_next)) with current (XDATA(p_first))
+; save current (XDATA(p_first))
+; update IDX and LEN by 16
+; on every step but the last, leave the macro if LEN reached zero
+;
+; note: %%AES_DEC and %%AES_DEC_LAST are handed to the AES_ENC / AES_ENC_LAST
+; parameters of AES_ENC_BLOCKS; %%CACHED_KEYS is defined but not referenced below
+;
+%macro CBC_ENC_SUBLOOP 17
+%define %%TOT_ROUNDS %1
+%define %%BLOCKS %2 ; can be 1...14
+%define %%START_DATA %3
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%TMP %8
+%define %%TMP_CNT %9
+%define %%FIRST_CKEY %10
+%define %%CKEY_CNT %11
+%define %%KEYS %12
+%define %%CACHED_KEYS %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+%define %%LEN %17
+
+ ; this_blk/next_blk are block offsets relative to IDX; p_first/p_next are xmm reg indices
+ %assign this_blk 0
+ %assign next_blk 1
+ %assign p_first %%START_DATA
+ %assign p_next (p_first+1)
+ ; for number of blocks to be processed in a loop
+ %assign blk 1
+ %rep %%BLOCKS
+ ; unless this is the final 16 bytes, load next block into a next XDATA reg (XDATA(p_next))
+ cmp %%LEN, 16
+ %push skip_read
+ je %$skip_read_next
+ %%MOVDQ XDATA(p_next), [%%IN + %%IDX + next_blk*16]
+ %$skip_read_next:
+ %pop
+
+ AES_ENC_BLOCKS %%TOT_ROUNDS, p_first, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%CKEY_CNT, %%KEYS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST
+
+ ; unless this is the final 16 bytes, XOR next block (XDATA(p_next)) with current (XDATA(p_first))
+ cmp %%LEN, 16
+ %push skip_next
+ je %$skip_next_blk_start
+ %%PXOR XDATA(p_next), XDATA(p_first)
+ %$skip_next_blk_start:
+ %pop
+
+ ; save current (XDATA(p_first))
+ %%MOVDQ [%%OUT + %%IDX + this_blk*16], XDATA(p_first)
+ ; update indexes for p_first
+ add %%IDX, 16
+ sub %%LEN, 16
+
+ %if (blk < %%BLOCKS) ; only insert jz if NOT last block
+ ; end if data zero (tests the flags of the 'sub' above)
+ jz %%END_CBC_ENC_SUBLOOP
+ %endif ; (blk < %%BLOCKS)
+
+ ; rotate registers: the block just loaded becomes the one to encrypt next
+ %assign p_first (p_next)
+ %assign blk (blk+1)
+ %if (blk == %%BLOCKS) ; the last rep loop's read of the next block needs to be into START_DATA
+ %assign p_next (%%START_DATA)
+ %elif (1 == %%BLOCKS)
+ ; single step unroll: copy the loaded block into START_DATA
+ %%MOVDQ XDATA(%%START_DATA), XDATA(p_next)
+ %else
+ %assign p_next (p_next+1)
+ %endif
+ %endrep ; %%BLOCKS
+
+ %%END_CBC_ENC_SUBLOOP:
+%endm ; CBC_ENC_SUBLOOP
+
+
+;
+;
+; AES BLOCK ENCODE MACROS
+;
+;
+
+;
+; FILL_KEY_CACHE
+; Preloads round keys into the key cache xmm regs: for every round below
+; both KEY_ROUNDS and the cache size, that round's key is loaded into
+; KEY_REG(cache start + round).
+%macro FILL_KEY_CACHE 4
+%define %%N_CACHED %1 ; number of round keys to keep in xmm regs
+%define %%CACHE_BASE %2 ; index of the first key cache xmm reg
+%define %%KEY_PTR %3 ; address of the round keys
+%define %%LOAD %4 ; load instruction
+
+ %assign rnd 0
+ %rep KEY_ROUNDS
+ %if (%%N_CACHED > rnd) ; this round has a cache slot
+ %assign c (%%CACHE_BASE + rnd)
+ %%LOAD KEY_REG(c), [%%KEY_PTR + rnd*16] ; round key -> its cache register
+ %endif
+ %assign rnd (rnd+1)
+ %endrep
+%endmacro ; FILL_KEY_CACHE
+
+;
+; SCHEDULE_DATA_LOAD
+; Emits at most one message block load per invocation: while fewer blocks
+; than requested have been loaded, block number 'blocks_loaded' is read into
+; XDATA(blocks_loaded) and the counter is advanced.
+; 'blocks_loaded' is an in/out global and must be declared in the using macro or function
+%macro SCHEDULE_DATA_LOAD 5
+%define %%WANTED %1 ; total number of blocks to load
+%define %%EARLY %2 ; not referenced in the body
+%define %%LOAD %3 ; load instruction
+%define %%SRC %4 ; input data
+%define %%OFFSET %5 ; index into the input data buffer
+
+ %if (%%WANTED > blocks_loaded)
+ %%LOAD XDATA(blocks_loaded), [%%SRC + %%OFFSET + blocks_loaded*16] ; next block of input
+ %assign blocks_loaded (blocks_loaded+1)
+ %endif ; (%%WANTED > blocks_loaded)
+%endmacro ; SCHEDULE_DATA_LOAD
+
+;
+; INIT_SELECT_KEY
+; Points 'current_tmp' at the first temp register and, when not every round
+; key is cached, preloads the first uncached round key into that register.
+; 'current_tmp' is an in/out global and must be declared in the using macro or function
+%macro INIT_SELECT_KEY 6
+%define %%N_ROUNDS %1 ; total number of key rounds
+%define %%N_CACHED %2 ; number of round keys cached in xmm regs
+%define %%KEY_PTR %3 ; address of the round keys
+%define %%TMP_BASE %4 ; index of the first temp xmm reg
+%define %%N_TMP %5 ; not referenced in the body
+%define %%LOAD %6 ; load instruction
+
+ %assign current_tmp (%%TMP_BASE)
+ %if (%%N_CACHED < %%N_ROUNDS) ; at least one round key is not cached
+ %%LOAD KEY_REG(current_tmp), [%%KEY_PTR + %%N_CACHED*16]
+ %endif ; (%%N_CACHED < %%N_ROUNDS)
+%endmacro ; INIT_SELECT_KEY
+
+;
+; SELECT_KEY
+; Sets the global 'key' to the index of the xmm reg that holds the key for round %%ROUND.
+; - cached round (%%ROUND < %%CACHED_KEYS): key = %%ROUND + %%FIRST_KEY, nothing is emitted
+; - uncached round: key = current_tmp, then
+; with one temp reg: this round's key is loaded into that reg
+; otherwise, while another round follows: current_tmp flips between
+; %%FIRST_TMP and %%FIRST_TMP+1 and the next round's key is loaded there
+; 'current_tmp' is an in/out global and must be declared in the using macro or function
+%macro SELECT_KEY 8
+%define %%ROUND %1
+%define %%TOT_ROUNDS %2
+%define %%CACHED_KEYS %3
+%define %%FIRST_KEY %4
+%define %%KEY_DATA %5
+%define %%FIRST_TMP %6
+%define %%TMP_CNT %7
+%define %%MOVDQ %8
+
+ ; find the key data for this round
+ %if (%%ROUND < %%CACHED_KEYS) ; is it cached
+ %assign key (%%ROUND + %%FIRST_KEY)
+ %else
+ ; Load non-cached key %%ROUND data ping-ponging between temp regs if more than one
+ %assign key (current_tmp) ; use the previous loaded key data
+ %if (1 == %%TMP_CNT)
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%ROUND*16] ; load this round's key data
+ %else
+ %assign next_round (%%ROUND+1)
+ %if (next_round < %%TOT_ROUNDS) ; if more rounds to be done
+ %if (current_tmp == %%FIRST_TMP) ; calc the next temp reg to use
+ %assign current_tmp (current_tmp + 1)
+ %else
+ %assign current_tmp (%%FIRST_TMP)
+ %endif ; (current_tmp == %%FIRST_TMP)
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + next_round*16] ; load the next round's key data
+
+ %endif ; (next_round < %%TOT_ROUNDS)
+ %endif ; (1 == %%TMP_CNT)
+ %endif ; (%%ROUND < %%CACHED_KEYS)
+%endmacro ; SELECT_KEY
+
+
+;
+; AES_PARALLEL_ENC_BLOCKS
+; Runs the AES rounds over %%PARALLEL_DATA blocks held in XDATA(%%FIRST_XDATA)...
+; preloads up to %%EARLY_LOADS data blocks to be worked on
+; loads the other blocks, one per block step, while round zero is applied
+; applies each key round to every block before moving on to the next round
+; Whether this encodes or decodes depends on the instructions passed in as
+; %%AES_DEC / %%AES_DEC_LAST.
+; With at least one block to process %%EARLY_LOADS must be at least 1: round
+; zero uses block 0 before the first interleaved load is emitted.
+; %%OUT is accepted but not referenced; results stay in the XDATA registers.
+%macro AES_PARALLEL_ENC_BLOCKS 16
+%define %%KEY_ROUNDS %1
+%define %%PARALLEL_DATA %2
+%define %%EARLY_LOADS %3
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%CACHED_KEYS %8
+%define %%TMP %9
+%define %%TMP_CNT %10
+%define %%FIRST_CKEY %11
+%define %%KEY_DATA %12
+%define %%FIRST_XDATA %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+
+ %assign blocks_loaded 0
+
+ %rep %%EARLY_LOADS
+ SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX ; updates blocks_loaded
+ %endrep ; %%EARLY_LOADS
+
+ ; use the macro's own parameter rather than the global TMP
+ %assign current_tmp (%%TMP)
+ INIT_SELECT_KEY %%KEY_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign round 0
+ %assign key 0
+ ; unroll over the round count passed in, so it always agrees with the
+ ; last round test below
+ %rep %%KEY_ROUNDS ; for all key rounds
+ SELECT_KEY round, %%KEY_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign i %%FIRST_XDATA
+ %rep %%PARALLEL_DATA ; for each block do the AES block step
+ %if (0 == round)
+ %%PXOR XDATA(i), KEY_REG(key) ; first round's step
+ SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX
+
+ %elif ( (%%KEY_ROUNDS-1) == round )
+ %%AES_DEC_LAST XDATA(i), KEY_REG(key) ; last round's step
+
+ %else
+ %%AES_DEC XDATA(i), KEY_REG(key) ; middle round's (1..last-1) step
+
+ %endif
+ %assign i (i+1)
+ %endrep ;%%PARALLEL_DATA
+ %assign round (round+1)
+ %endrep ;%%KEY_ROUNDS
+%endmacro ; AES_PARALLEL_ENC_BLOCKS
+
+
+
+;
+; AES_ENC_BLOCKS
+; Runs every key round over the single block held in XDATA(block index):
+; round zero XORs the key in, the last round uses the "last" instruction and
+; the rounds in between use the regular one.
+; Keys that are not cached are fetched through INIT_SELECT_KEY / SELECT_KEY,
+; which alternate between the temp registers when there is more than one.
+%macro AES_ENC_BLOCKS 11
+%define %%N_ROUNDS %1 ; total number of key rounds
+%define %%BLK %2 ; index of the xmm reg holding the block
+%define %%TMP_BASE %3 ; index of the first temp xmm reg
+%define %%N_TMP %4 ; number of temp xmm regs
+%define %%CKEY_BASE %5 ; index of the first key cache xmm reg
+%define %%N_CACHED %6 ; number of round keys cached in xmm regs
+%define %%KEY_PTR %7 ; address of the round keys
+%define %%LOAD %8
+%define %%XOR %9
+%define %%ROUND_OP %10
+%define %%LAST_OP %11
+
+ %assign current_tmp (%%TMP_BASE)
+ INIT_SELECT_KEY %%N_ROUNDS, %%N_CACHED, %%KEY_PTR, %%TMP_BASE, %%N_TMP, %%LOAD
+
+ %assign round 0
+ %assign key (round + %%CKEY_BASE)
+ %rep %%N_ROUNDS ; one pass per key round
+ ; sets 'key' to the register holding this round's key
+ SELECT_KEY round, %%N_ROUNDS, %%N_CACHED, %%CKEY_BASE, %%KEY_PTR, %%TMP_BASE, %%N_TMP, %%LOAD
+
+ %if (round == 0)
+ %%XOR XDATA(%%BLK), KEY_REG(key) ; round zero step
+ %elif ( round == (%%N_ROUNDS-1) )
+ %%LAST_OP XDATA(%%BLK), KEY_REG(key) ; last round's step
+ %else
+ %%ROUND_OP XDATA(%%BLK), KEY_REG(key) ; rounds 1..last-1 step
+ %endif ; (round == 0)
+
+ %assign round (round+1)
+ %endrep ; %%N_ROUNDS
+%endmacro ; AES_ENC_BLOCKS
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm
new file mode 100644
index 00000000..3b8a136e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm
@@ -0,0 +1,161 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES by 4
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_128_sse(void *in,
+; uint8_t *IV,
+; uint8_t *keys,
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
+%include "reg_sizes.asm"
+
+; elf64 build: the five arguments are taken from rdi, rsi, rdx, rcx and r8;
+; FUNC_SAVE and FUNC_RESTORE expand to nothing
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+; win64 build: four arguments are taken from rcx, rdx, r8 and r9; the fifth
+; (LEN) is read from the stack into r10 by FUNC_SAVE
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+; room for the ten xmm registers saved below plus one 8 byte slot
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+; address of argument x, counted from zero, relative to rsp after FUNC_SAVE
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+; FUNC_SAVE: allocate the frame, save xmm6-xmm15, then fetch LEN from the stack
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+; FUNC_RESTORE: reload xmm6-xmm15 from the frame and release it
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC macros
+; (these must be defined before cbc_common.asm is included below)
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (2)
+%define PARALLEL_BLOCKS (8)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+; (SSE forms; the decrypt instructions are handed to the shared macros)
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+%include "cbc_common.asm"
+
+section .text
+
+;
+; aes_cbc_dec_128_sse(in, IV, keys, out, len_bytes)
+; AES-128 CBC decrypt of LEN bytes (a multiple of 16) from IN to OUT.
+; The round keys are read from KEYS and the first chaining value from IV.
+; Full groups of PARALLEL_BLOCKS blocks are handled in main_loop, what is
+; left over is handled in groups of 4, 2 or 1 blocks.
+; IDX (rax) is used as the running offset into both buffers.
+;
+align 16
+global aes_cbc_dec_128_sse:function
+func(aes_cbc_dec_128_sse)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jl partials ; not enough data for a full parallel set
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ ; falling through means LEN < PARALLEL_BLOCKS*16, so this compare can not
+ ; report "equal"; the LEN == 0 case is caught at the top of partials
+
+partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done ; flags come from the 'sub LEN' that ends the macro
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials ; flags come from the 'sub LEN' that ends the macro
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm
new file mode 100644
index 00000000..a41d3900
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm
@@ -0,0 +1,161 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; routine to do AES128 CBC decrypt
+;; clobbers xmm0-15
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+; elf64 build: the five arguments are taken from rdi, rsi, rdx, rcx and r8;
+; FUNC_SAVE and FUNC_RESTORE expand to nothing
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+; win64 build: four arguments are taken from rcx, rdx, r8 and r9; the fifth
+; (LEN) is read from the stack into r10 by FUNC_SAVE
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+; room for the ten xmm registers saved below plus one 8 byte slot
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+; address of argument x, counted from zero, relative to rsp after FUNC_SAVE
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+; FUNC_SAVE: allocate the frame, save xmm6-xmm15, then fetch LEN from the stack
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+; FUNC_RESTORE: reload xmm6-xmm15 from the frame and release it
+; NOTE(review): these are non-VEX movdqa loads inside the AVX variant - confirm this is intended
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC
+; (these must be defined before cbc_common.asm is included below)
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+; (AVX forms; the two operand wrappers below repeat the destination as the
+; first source of the three operand instructions)
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+section .text
+
+;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;; AES-128 CBC decrypt of LEN bytes from IN to OUT.
+;; The round keys are read from KEYS and the first chaining value from IV.
+;; Full groups of PARALLEL_BLOCKS blocks are handled in main_loop, what is
+;; left over is handled in groups of 4, 2 or 1 blocks.
+;; IDX (rax) is used as the running offset into both buffers.
+global aes_cbc_dec_128_avx:function
+func(aes_cbc_dec_128_avx)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jl partials ; not enough data for a full parallel set
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ ; falling through means LEN < PARALLEL_BLOCKS*16, so this compare can not
+ ; report "equal"; the LEN == 0 case is caught at the top of partials
+
+partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done ; flags come from the 'sub LEN' that ends the macro
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials ; flags come from the 'sub LEN' that ends the macro
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm
new file mode 100644
index 00000000..eedff870
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm
@@ -0,0 +1,163 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES-192 CBC decrypt on 16n bytes
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_192_sse(void *in,
+; uint8_t *IV,
+; uint8_t keys[13], // +1 over key length
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; ELF64 build: the five arguments are read from rdi, rsi, rdx, rcx and r8.
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+; FUNC_SAVE / FUNC_RESTORE expand to nothing for this output format.
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+; Win64 build: four arguments arrive in rcx, rdx, r8 and r9; the fifth is
+; loaded from the stack into r10 by FUNC_SAVE.
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+; arg(x): 8-byte argument slot x (0-based) above this frame. stack_size skips
+; the local allocation; the extra PS presumably skips the return address.
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+; FUNC_SAVE: allocate stack_size bytes, hand xmm6-xmm15 with their 16-byte
+; slot offsets to save_xmm128 (defined outside this file), close the
+; prologue, then fetch the fifth argument into LEN.
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+; FUNC_RESTORE: reload xmm6-xmm15 from the same slot offsets and release the
+; stack_size bytes taken by FUNC_SAVE.
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC
+; Defined ahead of the cbc_common.asm include. KEY_ROUNDS, PARALLEL_BLOCKS and
+; EARLY_BLOCKS are also passed explicitly to the macros in the function body;
+; XMM_USAGE and IV_CNT are not referenced elsewhere in this file and are
+; presumably consumed by cbc_common.asm.
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (2)
+%define PARALLEL_BLOCKS (5)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+; Legacy-SSE (non-VEX) mnemonics, handed by name to the shared CBC macros.
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+
+%include "cbc_common.asm"
+
+section .text
+
+;; aes_cbc_dec_192_sse(void *in, uint8_t *IV, uint8_t *keys, void *out, uint64_t len_bytes)
+;; Dispatcher for CBC decryption of LEN bytes: PARALLEL_BLOCKS blocks per pass
+;; while at least that many remain, then groups of 4, 2 or 1 for the tail.
+;; A conditional jump placed directly after a CBC_DECRYPT_BLOCKS expansion
+;; tests the flags that macro leaves behind (presumably from its update of
+;; LEN; the macro is defined in cbc_common.asm -- confirm there).
+global aes_cbc_dec_192_sse:function
+func(aes_cbc_dec_192_sse)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ ; Falling through implies LEN < PARALLEL_BLOCKS*16, so ZF is clear here and
+ ; no equality test is needed; LEN == 0 is detected at the top of partials.
+
+partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done ; reached only with fewer than two blocks left
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done ; flags come from the macro expansion above
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials ; flags come from the macro expansion above
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm
new file mode 100644
index 00000000..3de1cbca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm
@@ -0,0 +1,157 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES192 CBC decrypt
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; ELF64 build: the five arguments are read from rdi, rsi, rdx, rcx and r8.
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+; FUNC_SAVE / FUNC_RESTORE expand to nothing for this output format.
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+; Win64 build: four arguments arrive in rcx, rdx, r8 and r9; the fifth is
+; loaded from the stack into r10 by FUNC_SAVE.
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+; arg(x): 8-byte argument slot x (0-based) above this frame. stack_size skips
+; the local allocation; the extra PS presumably skips the return address.
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+; FUNC_SAVE: allocate stack_size bytes, hand xmm6-xmm15 with their 16-byte
+; slot offsets to save_xmm128 (defined outside this file), close the
+; prologue, then fetch the fifth argument into LEN.
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+; FUNC_RESTORE: reload xmm6-xmm15 from the same slot offsets and release the
+; stack_size bytes taken by FUNC_SAVE.
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+; Defined ahead of the cbc_common.asm include. KEY_ROUNDS, PARALLEL_BLOCKS and
+; EARLY_BLOCKS are also passed explicitly to the macros in the function body;
+; XMM_USAGE and IV_CNT are not referenced elsewhere in this file and are
+; presumably consumed by cbc_common.asm.
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+; Each wrapper takes two operands (dst, src) and expands to the three-operand
+; VEX form with dst repeated as first source.
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+; AES_DEC dst, key: one vaesdec step applied in place to dst.
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+; AES_DEC_LAST dst, key: one vaesdeclast step applied in place to dst.
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;; Dispatcher for CBC decryption of LEN bytes: PARALLEL_BLOCKS blocks per pass
+;; while at least that many remain, then groups of 4, 2 or 1 for the tail.
+;; A conditional jump placed directly after a CBC_DECRYPT_BLOCKS expansion
+;; tests the flags that macro leaves behind (presumably from its update of
+;; LEN; the macro is defined in cbc_common.asm -- confirm there).
+global aes_cbc_dec_192_avx:function
+func(aes_cbc_dec_192_avx)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ ; Falling through implies LEN < PARALLEL_BLOCKS*16, so ZF is clear here and
+ ; no equality test is needed; LEN == 0 is detected at the top of partials.
+
+partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done ; reached only with fewer than two blocks left
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done ; flags come from the macro expansion above
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials ; flags come from the macro expansion above
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm
new file mode 100644
index 00000000..b6c081ff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm
@@ -0,0 +1,160 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES-256 CBC decrypt on 16n bytes
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_256_sse(void *in,
+; uint8_t *IV,
+; uint8_t *keys,
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; ELF64 build: the five arguments are read from rdi, rsi, rdx, rcx and r8.
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+; FUNC_SAVE / FUNC_RESTORE expand to nothing for this output format.
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+; Win64 build: four arguments arrive in rcx, rdx, r8 and r9; the fifth is
+; loaded from the stack into r10 by FUNC_SAVE.
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+; arg(x): 8-byte argument slot x (0-based) above this frame. stack_size skips
+; the local allocation; the extra PS presumably skips the return address.
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+; FUNC_SAVE: allocate stack_size bytes, hand xmm6-xmm15 with their 16-byte
+; slot offsets to save_xmm128 (defined outside this file), close the
+; prologue, then fetch the fifth argument into LEN.
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+; FUNC_RESTORE: reload xmm6-xmm15 from the same slot offsets and release the
+; stack_size bytes taken by FUNC_SAVE.
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+; Defined ahead of the cbc_common.asm include. KEY_ROUNDS, PARALLEL_BLOCKS and
+; EARLY_BLOCKS are also passed explicitly to the macros in the function body;
+; XMM_USAGE and IV_CNT are not referenced elsewhere in this file and are
+; presumably consumed by cbc_common.asm.
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+; Legacy-SSE (non-VEX) mnemonics, handed by name to the shared CBC macros.
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+
+%include "cbc_common.asm"
+
+;; aes_cbc_dec_256_sse(void *in, uint8_t *IV, uint8_t *keys, void *out, uint64_t len_bytes)
+;; Dispatcher for CBC decryption of LEN bytes: PARALLEL_BLOCKS blocks per pass
+;; while at least that many remain, then groups of 4, 2 or 1 for the tail.
+;; A conditional jump placed directly after a CBC_DECRYPT_BLOCKS expansion
+;; tests the flags that macro leaves behind (presumably from its update of
+;; LEN; the macro is defined in cbc_common.asm -- confirm there).
+global aes_cbc_dec_256_sse:function
+func(aes_cbc_dec_256_sse)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ ; Falling through implies LEN < PARALLEL_BLOCKS*16, so ZF is clear here and
+ ; no equality test is needed; LEN == 0 is detected at the top of partials.
+
+partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done ; reached only with fewer than two blocks left
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done ; flags come from the macro expansion above
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials ; flags come from the macro expansion above
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm
new file mode 100644
index 00000000..52efa3f5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm
@@ -0,0 +1,157 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES256 CBC decrypt
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; ELF64 build: the five arguments are read from rdi, rsi, rdx, rcx and r8.
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+; FUNC_SAVE / FUNC_RESTORE expand to nothing for this output format.
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+; Win64 build: four arguments arrive in rcx, rdx, r8 and r9; the fifth is
+; loaded from the stack into r10 by FUNC_SAVE.
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+; arg(x): 8-byte argument slot x (0-based) above this frame. stack_size skips
+; the local allocation; the extra PS presumably skips the return address.
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+; FUNC_SAVE: allocate stack_size bytes, hand xmm6-xmm15 with their 16-byte
+; slot offsets to save_xmm128 (defined outside this file), close the
+; prologue, then fetch the fifth argument into LEN.
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+; FUNC_RESTORE: reload xmm6-xmm15 from the same slot offsets and release the
+; stack_size bytes taken by FUNC_SAVE.
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+; Defined ahead of the cbc_common.asm include. KEY_ROUNDS, PARALLEL_BLOCKS and
+; EARLY_BLOCKS are also passed explicitly to the macros in the function body;
+; XMM_USAGE and IV_CNT are not referenced elsewhere in this file and are
+; presumably consumed by cbc_common.asm.
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+; Each wrapper takes two operands (dst, src) and expands to the three-operand
+; VEX form with dst repeated as first source.
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+; AES_DEC dst, key: one vaesdec step applied in place to dst.
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+; AES_DEC_LAST dst, key: one vaesdeclast step applied in place to dst.
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;; Dispatcher for CBC decryption of LEN bytes: PARALLEL_BLOCKS blocks per pass
+;; while at least that many remain, then groups of 4, 2 or 1 for the tail.
+;; A conditional jump placed directly after a CBC_DECRYPT_BLOCKS expansion
+;; tests the flags that macro leaves behind (presumably from its update of
+;; LEN; the macro is defined in cbc_common.asm -- confirm there).
+global aes_cbc_dec_256_avx:function
+func(aes_cbc_dec_256_avx)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ ; Falling through implies LEN < PARALLEL_BLOCKS*16, so ZF is clear here and
+ ; no equality test is needed; LEN == 0 is detected at the top of partials.
+
+partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done ; reached only with fewer than two blocks left
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done ; flags come from the macro expansion above
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials ; flags come from the macro expansion above
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
new file mode 100644
index 00000000..b3cdd834
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
@@ -0,0 +1,136 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x4(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; ELF64 build: the five arguments are read from rdi, rsi, rdx, rcx and r8.
+; IN0/KEYS0/OUT0 are second names for the same registers as IN/KEYS/OUT.
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+; FUNC_SAVE / FUNC_RESTORE expand to nothing for this output format.
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+; Win64 build: four arguments arrive in rcx, rdx, r8 and r9; the fifth is
+; loaded from the stack into r10 by FUNC_SAVE.
+; IN0/KEYS0/OUT0 are second names for the same registers as IN/KEYS/OUT.
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+; arg(x): 8-byte argument slot x (0-based) above this frame. stack_size skips
+; the local allocation; the extra PS presumably skips the return address.
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+; FUNC_SAVE: allocate stack_size bytes, hand xmm6-xmm15 with their 16-byte
+; slot offsets to save_xmm128 (defined outside this file), close the
+; prologue, then fetch the fifth argument into LEN.
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+; FUNC_RESTORE: reload xmm6-xmm15 from the same slot offsets and release the
+; stack_size bytes taken by FUNC_SAVE.
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; Configuration for the shared CBC macros, defined ahead of the
+; cbc_common.asm include. PARALLEL_BLOCKS is an alias of UNROLLED_LOOPS.
+; NOTE(review): this file defines EARLY_BLOCKS but no IV_CNT, while the x8
+; encoder defines IV_CNT but no EARLY_BLOCKS -- check which of the two
+; cbc_common.asm actually needs for the encrypt path.
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define EARLY_BLOCKS (2)
+
+; instruction set specific operation definitions
+; Legacy-SSE (non-VEX) mnemonics, handed by name to the shared CBC macros.
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+;; aes_cbc_enc_128_x4(void *in, uint8_t *IV, uint8_t *keys, void *out, uint64_t len_bytes)
+;; CBC encryption driver: caches the round keys, runs CBC_ENC_INIT once, then
+;; repeats CBC_ENC_SUBLOOP until that macro leaves ZF set.
+;; The loop tests its condition only after a pass, so a zero length is
+;; rejected up front instead of running the init and one pass on no data.
+global aes_cbc_enc_128_x4:function
+func(aes_cbc_enc_128_x4)
+ FUNC_SAVE
+
+ test LEN, LEN ; LEN == 0: nothing to encrypt
+ jz done ; leave through the normal epilogue
+
+ mov IDX, 0
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop ; flags come from the macro expansion above
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
new file mode 100644
index 00000000..a6be6df5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
@@ -0,0 +1,150 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x8(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;; clobbers all registers except for ARG1 and rbp
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; ELF64 build: the five arguments are read from rdi, rsi, rdx, rcx and r8.
+; IN0/KEYS0/OUT0 are second names for the same registers as IN/KEYS/OUT.
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+; FUNC_SAVE / FUNC_RESTORE expand to nothing for this output format.
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+; Win64 build: four arguments arrive in rcx, rdx, r8 and r9; the fifth is
+; loaded from the stack into r10 by FUNC_SAVE.
+; IN0/KEYS0/OUT0 are second names for the same registers as IN/KEYS/OUT.
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+; arg(x): 8-byte argument slot x (0-based) above this frame. stack_size skips
+; the local allocation; the extra PS presumably skips the return address.
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+; FUNC_SAVE: allocate stack_size bytes, hand xmm6-xmm15 with their 16-byte
+; slot offsets to save_xmm128 (defined outside this file), close the
+; prologue, then fetch the fifth argument into LEN.
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+; FUNC_RESTORE: reload xmm6-xmm15 from the same slot offsets and release the
+; stack_size bytes taken by FUNC_SAVE.
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; Configuration for the shared CBC macros, defined ahead of the
+; cbc_common.asm include. PARALLEL_BLOCKS is an alias of UNROLLED_LOOPS.
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+; All four operations use VEX encodings, so the loads/stores issued through
+; MOVDQ are not legacy-SSE instructions interleaved with the AVX AES ops.
+%define MOVDQ vmovdqu
+; PXOR dst, src: in-place XOR, three-operand VEX form with dst repeated.
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+; AES_ENC dst, key: one vaesenc step applied in place to dst.
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+; AES_ENC_LAST dst, key: one vaesenclast step applied in place to dst.
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;; aes_cbc_enc_128_x8(void *in, uint8_t *IV, uint8_t *keys, void *out, uint64_t len_bytes)
+;; CBC encryption driver: caches the round keys, runs CBC_ENC_INIT once, then
+;; repeats CBC_ENC_SUBLOOP until that macro leaves ZF set.
+;; The loop tests its condition only after a pass, so a zero length is
+;; rejected up front instead of running the init and one pass on no data.
+global aes_cbc_enc_128_x8:function
+func(aes_cbc_enc_128_x8)
+ FUNC_SAVE
+
+ test LEN, LEN ; LEN == 0: nothing to encrypt
+ jz done ; leave through the normal epilogue
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop ; flags come from the macro expansion above
+
+done:
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
new file mode 100644
index 00000000..cfaf83ba
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
@@ -0,0 +1,148 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+
+;include "mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu ;; assume buffers not aligned (the same definition is repeated further down in this file)
+%macro pxor2 2 ; %1 = %1 xor %2, staging %2 through XTMP with an unaligned-safe move
+ MOVDQ XTMP, %2 ; XTMP is not defined in this file - presumably by cbc_common.asm; confirm
+ pxor %1, XTMP
+%endm ; NOTE(review): pxor2 is not invoked anywhere in this file
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_192_x4(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi ; System V AMD64 argument registers follow; IN0 is the same register as IN
+%define IN rdi ; arg 1: pointer to input
+%define IV rsi ; arg 2: pointer to IV
+%define KEYS rdx ; arg 3: pointer to keys
+%define OUT rcx ; arg 4: pointer to output
+%define LEN r8 ; arg 5: length in bytes
+%define KEYS0 rdx ; same register as KEYS
+%define OUT0 rcx ; same register as OUT
+%define func(x) x: ; the entry point is a plain label for this output format
+%define FUNC_SAVE ; expands to nothing: no prologue for this output format
+%define FUNC_RESTORE ; expands to nothing: no epilogue for this output format
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx ; Microsoft x64 argument registers follow; IN0 is the same register as IN
+%define IN rcx ; arg 1: pointer to input
+%define IV rdx ; arg 2: pointer to IV
+%define KEYS0 r8 ; same register as KEYS
+%define OUT0 r9 ; same register as OUT
+%define KEYS r8 ; arg 3: pointer to keys
+%define OUT r9 ; arg 4: pointer to output
+%define LEN r10 ; arg 5 is passed on the stack; FUNC_SAVE copies it into r10
+%define PS 8 ; bytes per stack argument slot
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8: ten 16-byte xmm slots plus 8 bytes, so rsp (8 mod 16 at entry) ends up 16-byte aligned for movdqa
+%define arg(x) [rsp + stack_size + PS + PS*x] ; stack slot of argument x (0-based): skips the local frame and one PS-sized slot for the return address
+
+%define func(x) proc_frame x ; proc_frame is defined outside this file (presumably opens a function with unwind info - confirm)
+%macro FUNC_SAVE 0 ; prologue: allocate the frame, save xmm6-xmm15, then fetch LEN
+ alloc_stack stack_size ; alloc_stack / save_xmm128 / end_prolog are defined outside this file
+ save_xmm128 xmm6, 0*16 ; each register gets one 16-byte slot; FUNC_RESTORE reads the same offsets from rsp
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4) ; arg(4) is the fifth argument (length in bytes)
+%endmacro
+
+%macro FUNC_RESTORE 0 ; epilogue: reload xmm6-xmm15 and release the frame
+ movdqa xmm6, [rsp + 0*16] ; movdqa needs a 16-byte aligned address, hence the stack_size rule above
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size ; release the stack_size bytes reserved in FUNC_SAVE
+%endmacro
+%endif
+
+%define KEY_ROUNDS 13 ; presumably the round-key count (AES-192: 12 rounds + initial key) - confirm in cbc_common.asm
+%define XMM_USAGE (16) ; NOTE(review): looks like the number of xmm registers the shared macros may use - confirm
+%DEFINE UNROLLED_LOOPS (3) ; passed to CBC_ENC_SUBLOOP by the entry point below
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS) ; kept equal to UNROLLED_LOOPS; its consumer is outside this file
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu ; unaligned 128-bit move; repeats the definition made near the top of this file
+%define PXOR pxor ; legacy-SSE two-operand xor
+%define AES_ENC aesenc ; legacy-SSE AES round instruction
+%define AES_ENC_LAST aesenclast ; legacy-SSE final AES round instruction
+
+%include "cbc_common.asm"
+
+
+global aes_cbc_enc_192_x4:function ; export the entry point, typed as a function symbol
+func(aes_cbc_enc_192_x4) ; plain label on elf64, proc_frame on win64
+ FUNC_SAVE ; win64: save xmm6-xmm15 and load LEN from the stack; empty on elf64
+
+ mov IDX, 0 ; IDX is not defined in this file (presumably by cbc_common.asm); it starts at zero
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ ; macro from outside this file; presumably preloads round keys from KEYS - confirm
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX ; macro from outside this file; presumably sets up the first block from IV and IN - confirm
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN ; loop body; the flags it leaves behind drive the branch below
+ jne main_loop ; loop again while ZF is clear after CBC_ENC_SUBLOOP
+
+done: ; no branch in this file targets this label
+ FUNC_RESTORE ; win64: reload xmm6-xmm15 and release the frame; empty on elf64
+ ret
+
+endproc_frame ; pairs with proc_frame on win64; defined outside this file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm
new file mode 100644
index 00000000..ed72bbe9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm
@@ -0,0 +1,146 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_192_x8(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;; clobbers all registers except for ARG1 and rbp
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi ; System V AMD64 argument registers follow; IN0 is the same register as IN
+%define IN rdi ; arg 1: pointer to input
+%define IV rsi ; arg 2: pointer to IV
+%define KEYS rdx ; arg 3: pointer to keys
+%define OUT rcx ; arg 4: pointer to output
+%define LEN r8 ; arg 5: length in bytes
+%define KEYS0 rdx ; same register as KEYS
+%define OUT0 rcx ; same register as OUT
+%define func(x) x: ; the entry point is a plain label for this output format
+%define FUNC_SAVE ; expands to nothing: no prologue for this output format
+%define FUNC_RESTORE ; expands to nothing: no epilogue for this output format
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx ; Microsoft x64 argument registers follow; IN0 is the same register as IN
+%define IN rcx ; arg 1: pointer to input
+%define IV rdx ; arg 2: pointer to IV
+%define KEYS0 r8 ; same register as KEYS
+%define OUT0 r9 ; same register as OUT
+%define KEYS r8 ; arg 3: pointer to keys
+%define OUT r9 ; arg 4: pointer to output
+%define LEN r10 ; arg 5 is passed on the stack; FUNC_SAVE copies it into r10
+%define PS 8 ; bytes per stack argument slot
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8: ten 16-byte xmm slots plus 8 bytes, so rsp (8 mod 16 at entry) ends up 16-byte aligned for movdqa
+%define arg(x) [rsp + stack_size + PS + PS*x] ; stack slot of argument x (0-based): skips the local frame and one PS-sized slot for the return address
+
+%define func(x) proc_frame x ; proc_frame is defined outside this file (presumably opens a function with unwind info - confirm)
+%macro FUNC_SAVE 0 ; prologue: allocate the frame, save xmm6-xmm15, then fetch LEN
+ alloc_stack stack_size ; alloc_stack / save_xmm128 / end_prolog are defined outside this file
+ save_xmm128 xmm6, 0*16 ; each register gets one 16-byte slot; FUNC_RESTORE reads the same offsets from rsp
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4) ; arg(4) is the fifth argument (length in bytes)
+%endmacro
+
+%macro FUNC_RESTORE 0 ; epilogue: reload xmm6-xmm15 and release the frame
+ movdqa xmm6, [rsp + 0*16] ; movdqa needs a 16-byte aligned address, hence the stack_size rule above
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size ; release the stack_size bytes reserved in FUNC_SAVE
+%endmacro
+%endif
+
+%define KEY_ROUNDS 13 ; presumably the round-key count (AES-192: 12 rounds + initial key) - confirm in cbc_common.asm
+%define XMM_USAGE (16) ; NOTE(review): looks like the number of xmm registers the shared macros may use - confirm
+%DEFINE UNROLLED_LOOPS (3) ; passed to CBC_ENC_SUBLOOP by the entry point below
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS) ; kept equal to UNROLLED_LOOPS; its consumer is outside this file
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu ; NOTE(review): legacy-SSE move while the ops below are VEX-encoded - vmovdqu may be intended; confirm
+%macro PXOR 2 ; two-operand wrapper around the three-operand VEX form
+ vpxor %1, %1, %2 ; %1 = %1 xor %2
+%endm
+
+%macro AES_ENC 2 ; two-operand wrapper: %1 is both source state and destination
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2 ; two-operand wrapper: %1 is both source state and destination
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+global aes_cbc_enc_192_x8:function ; export the entry point, typed as a function symbol
+func(aes_cbc_enc_192_x8) ; plain label on elf64, proc_frame on win64
+ FUNC_SAVE ; win64: save xmm6-xmm15 and load LEN from the stack; empty on elf64
+
+ mov IDX, 0 ; IDX is not defined in this file (presumably by cbc_common.asm); it starts at zero
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ ; macro from outside this file; presumably preloads round keys from KEYS - confirm
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX ; macro from outside this file; presumably sets up the first block from IV and IN - confirm
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN ; loop body; the flags it leaves behind drive the branch below
+ jne main_loop ; loop again while ZF is clear after CBC_ENC_SUBLOOP
+
+done: ; no branch in this file targets this label
+ FUNC_RESTORE ; win64: reload xmm6-xmm15 and release the frame; empty on elf64
+ ret
+
+endproc_frame ; pairs with proc_frame on win64; defined outside this file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm
new file mode 100644
index 00000000..dd0ea562
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm
@@ -0,0 +1,140 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_256_x4(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi ; System V AMD64 argument registers follow; IN0 is the same register as IN
+%define IN rdi ; arg 1: pointer to input
+%define IV rsi ; arg 2: pointer to IV
+%define KEYS rdx ; arg 3: pointer to keys
+%define OUT rcx ; arg 4: pointer to output
+%define LEN r8 ; arg 5: length in bytes
+%define KEYS0 rdx ; same register as KEYS
+%define OUT0 rcx ; same register as OUT
+%define func(x) x: ; the entry point is a plain label for this output format
+%define FUNC_SAVE ; expands to nothing: no prologue for this output format
+%define FUNC_RESTORE ; expands to nothing: no epilogue for this output format
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx ; Microsoft x64 argument registers follow; IN0 is the same register as IN
+%define IN rcx ; arg 1: pointer to input
+%define IV rdx ; arg 2: pointer to IV
+%define KEYS0 r8 ; same register as KEYS
+%define OUT0 r9 ; same register as OUT
+%define KEYS r8 ; arg 3: pointer to keys
+%define OUT r9 ; arg 4: pointer to output
+%define LEN r10 ; arg 5 is passed on the stack; FUNC_SAVE copies it into r10
+%define PS 8 ; bytes per stack argument slot
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8: ten 16-byte xmm slots plus 8 bytes, so rsp (8 mod 16 at entry) ends up 16-byte aligned for movdqa
+%define arg(x) [rsp + stack_size + PS + PS*x] ; stack slot of argument x (0-based): skips the local frame and one PS-sized slot for the return address
+
+%define func(x) proc_frame x ; proc_frame is defined outside this file (presumably opens a function with unwind info - confirm)
+%macro FUNC_SAVE 0 ; prologue: allocate the frame, save xmm6-xmm15, then fetch LEN
+ alloc_stack stack_size ; alloc_stack / save_xmm128 / end_prolog are defined outside this file
+ save_xmm128 xmm6, 0*16 ; each register gets one 16-byte slot; FUNC_RESTORE reads the same offsets from rsp
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4) ; arg(4) is the fifth argument (length in bytes)
+%endmacro
+
+%macro FUNC_RESTORE 0 ; epilogue: reload xmm6-xmm15 and release the frame
+ movdqa xmm6, [rsp + 0*16] ; movdqa needs a 16-byte aligned address, hence the stack_size rule above
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size ; release the stack_size bytes reserved in FUNC_SAVE
+%endmacro
+%endif
+
+%define KEY_ROUNDS 15 ; presumably the round-key count (AES-256: 14 rounds + initial key) - confirm in cbc_common.asm
+%define XMM_USAGE (16) ; NOTE(review): looks like the number of xmm registers the shared macros may use - confirm
+%DEFINE UNROLLED_LOOPS (3) ; passed to CBC_ENC_SUBLOOP by the entry point below
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS) ; kept equal to UNROLLED_LOOPS; its consumer is outside this file
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu ; unaligned 128-bit move
+%define PXOR pxor ; legacy-SSE two-operand xor
+%define AES_ENC aesenc ; legacy-SSE AES round instruction
+%define AES_ENC_LAST aesenclast ; legacy-SSE final AES round instruction
+
+%include "cbc_common.asm"
+
+
+global aes_cbc_enc_256_x4:function ; export the entry point, typed as a function symbol
+func(aes_cbc_enc_256_x4) ; plain label on elf64, proc_frame on win64
+ FUNC_SAVE ; win64: save xmm6-xmm15 and load LEN from the stack; empty on elf64
+
+ mov IDX, 0 ; IDX is not defined in this file (presumably by cbc_common.asm); it starts at zero
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ ; macro from outside this file; presumably preloads round keys from KEYS - confirm
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX ; macro from outside this file; presumably sets up the first block from IV and IN - confirm
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN ; loop body; the flags it leaves behind drive the branch below
+ jne main_loop ; loop again while ZF is clear after CBC_ENC_SUBLOOP
+
+done: ; no branch in this file targets this label
+ FUNC_RESTORE ; win64: reload xmm6-xmm15 and release the frame; empty on elf64
+ ret
+
+endproc_frame ; pairs with proc_frame on win64; defined outside this file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm
new file mode 100644
index 00000000..74ad399d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm
@@ -0,0 +1,147 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_256_x8(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi ; System V AMD64 argument registers follow; IN0 is the same register as IN
+%define IN rdi ; arg 1: pointer to input
+%define IV rsi ; arg 2: pointer to IV
+%define KEYS rdx ; arg 3: pointer to keys
+%define OUT rcx ; arg 4: pointer to output
+%define LEN r8 ; arg 5: length in bytes
+%define KEYS0 rdx ; same register as KEYS
+%define OUT0 rcx ; same register as OUT
+%define func(x) x: ; the entry point is a plain label for this output format
+%define FUNC_SAVE ; expands to nothing: no prologue for this output format
+%define FUNC_RESTORE ; expands to nothing: no epilogue for this output format
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx ; Microsoft x64 argument registers follow; IN0 is the same register as IN
+%define IN rcx ; arg 1: pointer to input
+%define IV rdx ; arg 2: pointer to IV
+%define KEYS0 r8 ; same register as KEYS
+%define OUT0 r9 ; same register as OUT
+%define KEYS r8 ; arg 3: pointer to keys
+%define OUT r9 ; arg 4: pointer to output
+%define LEN r10 ; arg 5 is passed on the stack; FUNC_SAVE copies it into r10
+%define PS 8 ; bytes per stack argument slot
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8: ten 16-byte xmm slots plus 8 bytes, so rsp (8 mod 16 at entry) ends up 16-byte aligned for movdqa
+%define arg(x) [rsp + stack_size + PS + PS*x] ; stack slot of argument x (0-based): skips the local frame and one PS-sized slot for the return address
+
+%define func(x) proc_frame x ; proc_frame is defined outside this file (presumably opens a function with unwind info - confirm)
+%macro FUNC_SAVE 0 ; prologue: allocate the frame, save xmm6-xmm15, then fetch LEN
+ alloc_stack stack_size ; alloc_stack / save_xmm128 / end_prolog are defined outside this file
+ save_xmm128 xmm6, 0*16 ; each register gets one 16-byte slot; FUNC_RESTORE reads the same offsets from rsp
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4) ; arg(4) is the fifth argument (length in bytes)
+%endmacro
+
+%macro FUNC_RESTORE 0 ; epilogue: reload xmm6-xmm15 and release the frame
+ movdqa xmm6, [rsp + 0*16] ; movdqa needs a 16-byte aligned address, hence the stack_size rule above
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size ; release the stack_size bytes reserved in FUNC_SAVE
+%endmacro
+%endif
+
+%define KEY_ROUNDS 15 ; presumably the round-key count (AES-256: 14 rounds + initial key) - confirm in cbc_common.asm
+%define XMM_USAGE (16) ; NOTE(review): looks like the number of xmm registers the shared macros may use - confirm
+%DEFINE UNROLLED_LOOPS (3) ; passed to CBC_ENC_SUBLOOP by the entry point below
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS) ; kept equal to UNROLLED_LOOPS; its consumer is outside this file
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu ; NOTE(review): legacy-SSE move while the ops below are VEX-encoded - vmovdqu may be intended; confirm
+%macro PXOR 2 ; two-operand wrapper around the three-operand VEX form
+ vpxor %1, %1, %2 ; %1 = %1 xor %2
+%endm
+
+%macro AES_ENC 2 ; two-operand wrapper: %1 is both source state and destination
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2 ; two-operand wrapper: %1 is both source state and destination
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+global aes_cbc_enc_256_x8:function ; export the entry point, typed as a function symbol
+func(aes_cbc_enc_256_x8) ; plain label on elf64, proc_frame on win64
+ FUNC_SAVE ; win64: save xmm6-xmm15 and load LEN from the stack; empty on elf64
+
+ mov IDX, 0 ; IDX is not defined in this file (presumably by cbc_common.asm); it starts at zero
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ ; macro from outside this file; presumably preloads round keys from KEYS - confirm
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX ; macro from outside this file; presumably sets up the first block from IV and IN - confirm
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN ; loop body; the flags it leaves behind drive the branch below
+ jne main_loop ; loop again while ZF is clear after CBC_ENC_SUBLOOP
+
+done: ; no branch in this file targets this label
+ FUNC_RESTORE ; win64: reload xmm6-xmm15 and release the frame; empty on elf64
+ ret
+
+endproc_frame ; pairs with proc_frame on win64; defined outside this file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm
new file mode 100644
index 00000000..fc458ea4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm
@@ -0,0 +1,83 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt ; ELF: symbol references go through the PLT; not used directly in this file (presumably by the multibinary.asm macros - confirm)
+%else
+%define WRT_OPT ; other output formats: expands to nothing
+%endif
+
+%include "reg_sizes.asm"
+
+default rel ; memory operands are RIP-relative unless stated otherwise
+[bits 64]
+
+extern aes_cbc_dec_128_sse ; decrypt back ends: one _sse and one _avx symbol per key size
+extern aes_cbc_dec_128_avx
+extern aes_cbc_dec_192_sse
+extern aes_cbc_dec_192_avx
+extern aes_cbc_dec_256_sse
+extern aes_cbc_dec_256_avx
+
+
+extern aes_cbc_enc_128_x4 ; encrypt back ends: one _x4 and one _x8 symbol per key size
+extern aes_cbc_enc_128_x8
+extern aes_cbc_enc_192_x4
+extern aes_cbc_enc_192_x8
+extern aes_cbc_enc_256_x4
+extern aes_cbc_enc_256_x8
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate aesni_cbc interfaces enc and dec
+;;;;
+mbin_interface aes_cbc_dec_128 ; mbin_interface / mbin_dispatch_init are macros from outside this file (presumably multibinary.asm)
+mbin_dispatch_init aes_cbc_dec_128, aes_cbc_dec_128_sse, aes_cbc_dec_128_avx, aes_cbc_dec_128_avx ; the last two candidate slots name the same _avx routine
+mbin_interface aes_cbc_dec_192
+mbin_dispatch_init aes_cbc_dec_192, aes_cbc_dec_192_sse, aes_cbc_dec_192_avx, aes_cbc_dec_192_avx
+mbin_interface aes_cbc_dec_256
+mbin_dispatch_init aes_cbc_dec_256, aes_cbc_dec_256_sse, aes_cbc_dec_256_avx, aes_cbc_dec_256_avx
+
+mbin_interface aes_cbc_enc_128
+mbin_dispatch_init aes_cbc_enc_128, aes_cbc_enc_128_x4, aes_cbc_enc_128_x8, aes_cbc_enc_128_x8 ; the last two candidate slots name the same _x8 routine
+mbin_interface aes_cbc_enc_192
+mbin_dispatch_init aes_cbc_enc_192, aes_cbc_enc_192_x4, aes_cbc_enc_192_x8, aes_cbc_enc_192_x8
+mbin_interface aes_cbc_enc_256
+mbin_dispatch_init aes_cbc_enc_256, aes_cbc_enc_256_x4, aes_cbc_enc_256_x8, aes_cbc_enc_256_x8
+
+
+
+;;; func core, ver, snum
+slversion aes_cbc_enc_128, 00, 00, 0291 ; slversion is defined outside this file; snum values run 0291-0296, one per interface
+slversion aes_cbc_dec_128, 00, 00, 0292
+slversion aes_cbc_enc_192, 00, 00, 0293
+slversion aes_cbc_dec_192, 00, 00, 0294
+slversion aes_cbc_enc_256, 00, 00, 0295
+slversion aes_cbc_dec_256, 00, 00, 0296
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c
new file mode 100644
index 00000000..017e523d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c
@@ -0,0 +1,315 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_cbc.h>
+#include <test.h>
+#include "ossl_helper.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static unsigned char const ic[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f
+};
+
+static unsigned char *plaintext, *cbc_plaintext, *cyphertext, *ossl_plaintext,
+ *ossl_cyphertext;
+static uint8_t test_key[CBC_256_BITS];
+
+/* Fill data[0 .. size-1] with bytes taken from successive rand() calls. */
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+	uint8_t *const end = data + size;
+	while (data != end)
+		*data++ = rand();
+}
+
+/*
+ * Time AES-128-CBC encode and decode, ISA-L against OpenSSL: TEST_LOOPS
+ * passes over TEST_LEN bytes per measurement.  Returns 0 on success, or 1
+ * when the aligned IV / key-schedule buffers cannot be allocated.
+ */
+int aes_128_perf(uint8_t * key)
+{
+	int i;
+	uint8_t *iv = NULL;
+	struct cbc_key_data *key_data = NULL;
+
+	if (posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)) != 0)
+		return 1;
+	if (posix_memalign((void **)&key_data, 16, (sizeof(*key_data))) != 0) {
+		free(iv);
+		return 1;
+	}
+
+	memcpy(iv, ic, CBC_IV_DATA_LEN);
+	aes_cbc_precomp(key, 128, key_data);
+	/* untimed pass: fills the ciphertext buffers that the decode loops read */
+	aes_cbc_enc_128(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+	openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			/* write to cyphertext (as the 192/256 tests do) so plaintext stays intact */
+			aes_cbc_enc_128(plaintext, iv, key_data->enc_keys,
+					cyphertext, TEST_LEN);
+		}
+		perf_stop(&stop);
+		printf("ISA-L__aes_cbc_128_encode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+		}
+		perf_stop(&stop);
+		printf("OpenSSL_aes_cbc_128_encode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			aes_cbc_dec_128(cyphertext, iv, key_data->dec_keys,
+					cbc_plaintext, TEST_LEN);
+		}
+		perf_stop(&stop);
+		printf("ISA-L__aes_cbc_128_decode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			openssl_aes_128_cbc_dec(key, iv, TEST_LEN,
+						ossl_cyphertext, ossl_plaintext);
+		}
+		perf_stop(&stop);
+		printf("OpenSSL_aes_cbc_128_decode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+
+	free(key_data);
+	free(iv);
+	printf("\n");
+	return 0;
+}
+
+/*
+ * Time AES-192-CBC encode/decode, ISA-L against OpenSSL (TEST_LOOPS passes of
+ * TEST_LEN bytes each).  Returns 0, or 1 if the aligned buffers cannot be allocated.
+ */
+int aes_192_perf(uint8_t * key)
+{
+	int i;
+	uint8_t *iv = NULL;
+	struct cbc_key_data *key_data = NULL;
+
+	if (posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)) != 0)
+		return 1;
+	if (posix_memalign((void **)&key_data, 16, (sizeof(*key_data))) != 0) {
+		free(iv);
+		return 1;
+	}
+
+	memcpy(iv, ic, CBC_IV_DATA_LEN);
+	aes_cbc_precomp(key, 192, key_data);
+	/* untimed pass: fills the ciphertext buffers that the decode loops read */
+	aes_cbc_enc_192(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+	openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			aes_cbc_enc_192(plaintext, iv, key_data->enc_keys,
+					cyphertext, TEST_LEN);
+		}
+		perf_stop(&stop);
+		printf("ISA-L__aes_cbc_192_encode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+		}
+		perf_stop(&stop);
+		printf("OpenSSL_aes_cbc_192_encode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			aes_cbc_dec_192(cyphertext, iv, key_data->dec_keys,
+					cbc_plaintext, TEST_LEN);
+		}
+		perf_stop(&stop);
+		printf("ISA-L__aes_cbc_192_decode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			openssl_aes_192_cbc_dec(key, iv, TEST_LEN,
+						ossl_cyphertext, ossl_plaintext);
+		}
+		perf_stop(&stop);
+		printf("OpenSSL_aes_cbc_192_decode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	free(key_data);
+	free(iv);
+	printf("\n");
+	return 0;
+}
+
+/*
+ * Time AES-256-CBC encode/decode, ISA-L against OpenSSL (TEST_LOOPS passes of
+ * TEST_LEN bytes each).  Returns 0, or 1 if the aligned buffers cannot be allocated.
+ */
+int aes_256_perf(uint8_t * key)
+{
+	int i;
+	uint8_t *iv = NULL;
+	struct cbc_key_data *key_data = NULL;
+
+	if (posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)) != 0)
+		return 1;
+	if (posix_memalign((void **)&key_data, 16, (sizeof(*key_data))) != 0) {
+		free(iv);
+		return 1;
+	}
+
+	aes_cbc_precomp(key, 256, key_data);
+	memcpy(iv, ic, CBC_IV_DATA_LEN);
+	/* untimed pass: fills the ciphertext buffers that the decode loops read */
+	aes_cbc_enc_256(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+	openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			aes_cbc_enc_256(plaintext, iv, key_data->enc_keys,
+					cyphertext, TEST_LEN);
+		}
+		perf_stop(&stop);
+		printf("ISA-L__aes_cbc_256_encode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+		}
+		perf_stop(&stop);
+		printf("OpenSSL_aes_cbc_256_encode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			aes_cbc_dec_256(cyphertext, iv, key_data->dec_keys,
+					cbc_plaintext, TEST_LEN);
+		}
+		perf_stop(&stop);
+		printf("ISA-L__aes_cbc_256_decode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	{
+		struct perf start, stop;
+		perf_start(&start);
+		for (i = 0; i < TEST_LOOPS; i++) {
+			openssl_aes_256_cbc_dec(key, iv, TEST_LEN,
+						ossl_cyphertext, ossl_plaintext);
+		}
+		perf_stop(&stop);
+		printf("OpenSSL_aes_cbc_256_decode" TEST_TYPE_STR ": ");
+		perf_print(stop, start, (long long)TEST_LEN * i);
+	}
+	free(key_data);
+	free(iv);
+	printf("\n");
+	return 0;
+}
+
+/* Allocate the shared work buffers, fill input and key with rand() data, run each key-size benchmark. */
+int main(void)
+{
+	uint32_t failures = 0;
+
+	srand(TEST_SEED);
+
+	plaintext = malloc(TEST_LEN);
+	cbc_plaintext = malloc(TEST_LEN);
+	cyphertext = malloc(TEST_LEN);
+	ossl_plaintext = malloc(TEST_LEN);
+	ossl_cyphertext = malloc(TEST_LEN);
+	if (!plaintext || !cyphertext || !cbc_plaintext || !ossl_plaintext || !ossl_cyphertext) {
+		printf("malloc of testsize:0x%x failed\n", TEST_LEN);
+		return 1;
+	}
+
+	mk_rand_data(plaintext, TEST_LEN);
+	mk_rand_data(test_key, sizeof(test_key));
+	printf("AES CBC ISA-L vs OpenSSL performance:\n");
+	failures += aes_128_perf(test_key);
+	failures += aes_192_perf(test_key);
+	failures += aes_256_perf(test_key);
+
+	return failures;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c
new file mode 100644
index 00000000..6284d905
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c
@@ -0,0 +1,56 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <aes_cbc.h>
+#include <aes_keyexp.h>
+
+/*
+ * Expand a raw AES key into the encrypt/decrypt key schedules of keys_blk.
+ *
+ * key      - raw key bytes handed to the matching aes_keyexp_* routine
+ * key_size - one of CBC_128_BITS, CBC_192_BITS or CBC_256_BITS
+ * keys_blk - receives the expanded keys in enc_keys and dec_keys
+ *
+ * Returns 0 on success, 1 when key_size is not a supported value (in which
+ * case keys_blk is left untouched).
+ */
+int aes_cbc_precomp(uint8_t * key, int key_size, struct cbc_key_data *keys_blk)
+{
+	switch (key_size) {
+	case CBC_128_BITS:
+		aes_keyexp_128(key, keys_blk->enc_keys, keys_blk->dec_keys);
+		return 0;
+	case CBC_192_BITS:
+		aes_keyexp_192(key, keys_blk->enc_keys, keys_blk->dec_keys);
+		return 0;
+	case CBC_256_BITS:
+		aes_keyexp_256(key, keys_blk->enc_keys, keys_blk->dec_keys);
+		return 0;
+	default:
+		// Unsupported key length
+		return 1;
+	}
+}
+
+// Version record attached to a library entry point.
+// NOTE(review): field meanings are not visible in this file; the names
+// suggest a serial number, a version and a core/variant id -- confirm
+// against the project's versioning convention.
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+// Tentative definition with no initializer: only the symbol name carries
+// the 00000297 tag.
+struct slver aes_cbc_precomp_slver_00000297;
+// snum = 0x0297, ver = 0x00, core = 0x00
+struct slver aes_cbc_precomp_slver = { 0x0297, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h
new file mode 100644
index 00000000..981aae96
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h
@@ -0,0 +1,466 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_CBC_STD_VECTORS_H_
+#define AES_CBC_STD_VECTORS_H_
+#include <aes_cbc.h>
+
+
+// One AES-CBC test case: pointers to the key, IV, plaintext and expected
+// ciphertext, plus scratch pointers filled in by the test code.
+struct cbc_vector {
+ uint8_t* K; // AES Key
+ cbc_key_size K_LEN; // key length as a cbc_key_size value; vector() below fills it with sizeof(key array), i.e. a byte count
+ uint8_t* IV; // initialization vector used by CBC
+ uint64_t P_LEN; // length of our plaintext in bytes
+ uint8_t* P; // Plain text
+ //outputs of encryption
+ uint8_t* EXP_C; // expected cipher text, same length as P
+ // used in vector checks, not populated in std vector array
+ uint8_t *C; // buffer receiving the computed cipher text
+ struct cbc_key_data *KEYS; // expanded encrypt/decrypt keys
+};
+
+
+///////////////////////////////////////////
+// Test vectors from:
+// Intel IPSec library 1..3
+//
+///////////////////////////////////////////
+static unsigned char K1[] = {
+ 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c
+};
+static unsigned char IV1[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P1[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C1[] = {
+ 0x76, 0x49, 0xab, 0xac, 0x81, 0x19, 0xb2, 0x46, 0xce, 0xe9, 0x8e, 0x9b, 0x12, 0xe9, 0x19, 0x7d,
+ 0x50, 0x86, 0xcb, 0x9b, 0x50, 0x72, 0x19, 0xee, 0x95, 0xdb, 0x11, 0x3a, 0x91, 0x76, 0x78, 0xb2,
+ 0x73, 0xbe, 0xd6, 0xb8, 0xe3, 0xc1, 0x74, 0x3b, 0x71, 0x16, 0xe6, 0x9e, 0x22, 0x22, 0x95, 0x16,
+ 0x3f, 0xf1, 0xca, 0xa1, 0x68, 0x1f, 0xac, 0x09, 0x12, 0x0e, 0xca, 0x30, 0x75, 0x86, 0xe1, 0xa7
+};
+
+static unsigned char K2[] = {
+ 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
+ 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4
+};
+static unsigned char IV2[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P2[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C2[] = {
+ 0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6,
+ 0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d,
+ 0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61,
+ 0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b
+};
+
+static unsigned char K3[] = {
+ 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
+ 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7
+};
+static unsigned char IV3[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P3[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C3[] = {
+ 0x17, 0x70, 0x1a, 0x9d, 0x29, 0xc9, 0x1a, 0x94, 0xce, 0xed, 0x72, 0x3c, 0x34, 0xe8,
+ 0x7a, 0xbe, 0x1c, 0x96, 0x84, 0x5c, 0xa8, 0xb7, 0xe8, 0x58, 0x6d, 0xfe, 0xf2, 0xfa,
+ 0x6b, 0xed, 0x24, 0x09, 0x8a, 0x52, 0xce, 0xe8, 0xd7, 0x6d, 0xb6, 0x7b, 0xfd, 0xe2,
+ 0x15, 0x53, 0xd3, 0x1c, 0x28, 0x33, 0xf7, 0x7e, 0xb5, 0x95, 0x00, 0xac, 0x49, 0x03,
+ 0xbc, 0x70, 0x76, 0xb1, 0x84, 0x65, 0xd0, 0xea
+};
+
+///////////////////////////////////////////
+// Test vectors from:
+// 'https://tools.ietf.org/html/rfc3602#section-3.2'
+// The AES-CBC Cipher Algorithm and Its Use with IPsec
+//
+///////////////////////////////////////////
+/*
+Case #1: Encrypting 16 bytes (1 block) using AES-CBC with 128-bit key
+Key : 0x06a9214036b8a15b512e03d534120006
+IV : 0x3dafba429d9eb430b422da802c9fac41
+Plaintext : "Single block msg"
+Ciphertext: 0xe353779c1079aeb82708942dbe77181a
+ *
+ */
+static unsigned char K4[] = {
+ 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b, 0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06
+};
+static unsigned char IV4[] = {
+ 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30, 0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41
+};
+static unsigned char P4[] = {
+ "Single block msg"
+};
+static unsigned char C4[] = {
+ 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8, 0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a
+};
+
+/*
+Case #2: Encrypting 32 bytes (2 blocks) using AES-CBC with 128-bit key
+Key : 0xc286696d887c9aa0611bbb3e2025a45a
+IV : 0x562e17996d093d28ddb3ba695a2e6f58
+Plaintext : 0x000102030405060708090a0b0c0d0e0f
+ 101112131415161718191a1b1c1d1e1f
+Ciphertext: 0xd296cd94c2cccf8a3a863028b5e1dc0a
+ 7586602d253cfff91b8266bea6d61ab1
+*/
+static unsigned char K5[] = {
+ 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0, 0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a
+};
+static unsigned char IV5[] = {
+ 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28, 0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58
+};
+static unsigned char P5[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b,
+ 0x1c, 0x1d, 0x1e, 0x1f
+};
+static unsigned char C5[] = {
+ 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a, 0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1,
+ 0xdc, 0x0a, 0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9, 0x1b, 0x82, 0x66, 0xbe,
+ 0xa6, 0xd6, 0x1a, 0xb1
+};
+
+/*
+Case #3: Encrypting 48 bytes (3 blocks) using AES-CBC with 128-bit key
+Key : 0x6c3ea0477630ce21a2ce334aa746c2cd
+IV : 0xc782dc4c098c66cbd9cd27d825682c81
+Plaintext : "This is a 48-byte message (exactly 3 AES blocks)"
+Ciphertext: 0xd0a02b3836451753d493665d33f0e886
+ 2dea54cdb293abc7506939276772f8d5
+ 021c19216bad525c8579695d83ba2684
+
+ */
+static unsigned char K6[] = {
+ 0x6c, 0x3e, 0xa0, 0x47, 0x76, 0x30, 0xce, 0x21, 0xa2, 0xce, 0x33, 0x4a, 0xa7, 0x46, 0xc2, 0xcd
+};
+static unsigned char IV6[] = {
+ 0xc7, 0x82, 0xdc, 0x4c, 0x09, 0x8c, 0x66, 0xcb, 0xd9, 0xcd, 0x27, 0xd8, 0x25, 0x68, 0x2c, 0x81
+};
+static unsigned char P6[] = {
+ "This is a 48-byte message (exactly 3 AES blocks)"
+};
+static unsigned char C6[] = {
+ 0xd0, 0xa0, 0x2b, 0x38, 0x36, 0x45, 0x17, 0x53, 0xd4, 0x93, 0x66, 0x5d, 0x33, 0xf0, 0xe8, 0x86,
+ 0x2d, 0xea, 0x54, 0xcd, 0xb2, 0x93, 0xab, 0xc7, 0x50, 0x69, 0x39, 0x27, 0x67, 0x72, 0xf8, 0xd5,
+ 0x02, 0x1c, 0x19, 0x21, 0x6b, 0xad, 0x52, 0x5c, 0x85, 0x79, 0x69, 0x5d, 0x83, 0xba, 0x26, 0x84
+};
+
+/*
+Case #4: Encrypting 64 bytes (4 blocks) using AES-CBC with 128-bit key
+Key : 0x56e47a38c5598974bc46903dba290349
+IV : 0x8ce82eefbea0da3c44699ed7db51b7d9
+Plaintext : 0xa0a1a2a3a4a5a6a7a8a9aaabacadaeaf
+ b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+ d0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+Ciphertext: 0xc30e32ffedc0774e6aff6af0869f71aa
+ 0f3af07a9a31a9c684db207eb0ef8e4e
+ 35907aa632c3ffdf868bb7b29d3d46ad
+ 83ce9f9a102ee99d49a53e87f4c3da55
+ */
+static unsigned char K7[] = {
+ 0x56, 0xe4, 0x7a, 0x38, 0xc5, 0x59, 0x89, 0x74, 0xbc, 0x46, 0x90, 0x3d, 0xba, 0x29, 0x03, 0x49
+};
+static unsigned char IV7[] = {
+ 0x8c, 0xe8, 0x2e, 0xef, 0xbe, 0xa0, 0xda, 0x3c, 0x44, 0x69, 0x9e, 0xd7, 0xdb, 0x51, 0xb7, 0xd9
+};
+static unsigned char P7[] = {
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf
+};
+static unsigned char C7[] = {
+ 0xc3, 0x0e, 0x32, 0xff, 0xed, 0xc0, 0x77, 0x4e, 0x6a, 0xff, 0x6a, 0xf0, 0x86, 0x9f, 0x71, 0xaa,
+ 0x0f, 0x3a, 0xf0, 0x7a, 0x9a, 0x31, 0xa9, 0xc6, 0x84, 0xdb, 0x20, 0x7e, 0xb0, 0xef, 0x8e, 0x4e,
+ 0x35, 0x90, 0x7a, 0xa6, 0x32, 0xc3, 0xff, 0xdf, 0x86, 0x8b, 0xb7, 0xb2, 0x9d, 0x3d, 0x46, 0xad,
+ 0x83, 0xce, 0x9f, 0x9a, 0x10, 0x2e, 0xe9, 0x9d, 0x49, 0xa5, 0x3e, 0x87, 0xf4, 0xc3, 0xda, 0x55
+};
+
+/*
+Case #5: Sample transport-mode ESP packet (ping 192.168.123.100)
+Key: 90d382b4 10eeba7a d938c46c ec1a82bf
+SPI: 4321
+Source address: 192.168.123.3
+Destination address: 192.168.123.100
+Sequence number: 1
+IV: e96e8c08 ab465763 fd098d45 dd3ff893
+
+Original packet:
+IP header (20 bytes): 45000054 08f20000 4001f9fe c0a87b03 c0a87b64
+Data (64 bytes):
+08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+
+Augment data with:
+Padding: 01020304 05060708 090a0b0c 0d0e
+Pad length: 0e
+Next header: 01 (ICMP)
+
+Pre-encryption Data with padding, pad length and next header (80 bytes):
+08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+01020304 05060708 090a0b0c 0d0e0e01
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500007c 08f20000 4032f9a5 c0a87b03 c0a87b64
+SPI/Seq #: 00004321 00000001
+IV: e96e8c08 ab465763 fd098d45 dd3ff893
+Encrypted Data (80 bytes):
+f663c25d 325c18c6 a9453e19 4e120849 a4870b66 cc6b9965 330013b4 898dc856
+a4699e52 3a55db08 0b59ec3a 8e4b7e52 775b07d1 db34ed9c 538ab50c 551b874a
+a269add0 47ad2d59 13ac19b7 cfbad4a6
+*/
+static unsigned char K8[] = {
+ 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf
+};
+static unsigned char IV8[] = {
+ 0xe9, 0x6e, 0x8c, 0x08, 0xab, 0x46, 0x57, 0x63, 0xfd, 0x09, 0x8d, 0x45, 0xdd, 0x3f, 0xf8, 0x93
+};
+static unsigned char P8[] = {
+ 0x08, 0x00, 0x0e, 0xbd, 0xa7, 0x0a, 0x00, 0x00, 0x8e, 0x9c, 0x08, 0x3d, 0xb9, 0x5b, 0x07, 0x00,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x01
+};
+static unsigned char C8[] = {
+ 0xf6, 0x63, 0xc2, 0x5d, 0x32, 0x5c, 0x18, 0xc6, 0xa9, 0x45, 0x3e, 0x19, 0x4e, 0x12, 0x08, 0x49,
+ 0xa4, 0x87, 0x0b, 0x66, 0xcc, 0x6b, 0x99, 0x65, 0x33, 0x00, 0x13, 0xb4, 0x89, 0x8d, 0xc8, 0x56,
+ 0xa4, 0x69, 0x9e, 0x52, 0x3a, 0x55, 0xdb, 0x08, 0x0b, 0x59, 0xec, 0x3a, 0x8e, 0x4b, 0x7e, 0x52,
+ 0x77, 0x5b, 0x07, 0xd1, 0xdb, 0x34, 0xed, 0x9c, 0x53, 0x8a, 0xb5, 0x0c, 0x55, 0x1b, 0x87, 0x4a,
+ 0xa2, 0x69, 0xad, 0xd0, 0x47, 0xad, 0x2d, 0x59, 0x13, 0xac, 0x19, 0xb7, 0xcf, 0xba, 0xd4, 0xa6
+};
+
+/*
+Case #6: Sample transport-mode ESP packet
+ (ping -p 77 -s 20 192.168.123.100)
+Key: 90d382b4 10eeba7a d938c46c ec1a82bf
+SPI: 4321
+Source address: 192.168.123.3
+Destination address: 192.168.123.100
+Sequence number: 8
+IV: 69d08df7 d203329d b093fc49 24e5bd80
+
+Original packet:
+IP header (20 bytes): 45000030 08fe0000 4001fa16 c0a87b03 c0a87b64
+Data (28 bytes):
+0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777
+
+Augment data with:
+Padding: 0102
+Pad length: 02
+Next header: 01 (ICMP)
+
+Pre-encryption Data with padding, pad length and next header (32 bytes):
+0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777 01020201
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500004c 08fe0000 4032f9c9 c0a87b03 c0a87b64
+SPI/Seq #: 00004321 00000008
+IV: 69d08df7 d203329d b093fc49 24e5bd80
+Encrypted Data (32 bytes):
+f5199588 1ec4e0c4 488987ce 742e8109 689bb379 d2d750c0 d915dca3 46a89f75
+ */
+static unsigned char K9[] = {
+ 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf
+};
+static unsigned char IV9[] = {
+ 0x69, 0xd0, 0x8d, 0xf7, 0xd2, 0x03, 0x32, 0x9d, 0xb0, 0x93, 0xfc, 0x49, 0x24, 0xe5, 0xbd, 0x80
+};
+static unsigned char P9[] = {
+ 0x08, 0x00, 0xb5, 0xe8, 0xa8, 0x0a, 0x05, 0x00, 0xa6, 0x9c, 0x08, 0x3d, 0x0b, 0x66, 0x0e, 0x00,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x01, 0x02, 0x02, 0x01
+};
+static unsigned char C9[] = {
+ 0xf5, 0x19, 0x95, 0x88, 0x1e, 0xc4, 0xe0, 0xc4, 0x48, 0x89, 0x87, 0xce, 0x74, 0x2e, 0x81, 0x09,
+ 0x68, 0x9b, 0xb3, 0x79, 0xd2, 0xd7, 0x50, 0xc0, 0xd9, 0x15, 0xdc, 0xa3, 0x46, 0xa8, 0x9f, 0x75
+};
+
+/*
+Case #7: Sample tunnel-mode ESP packet (ping 192.168.123.200)
+Key: 01234567 89abcdef 01234567 89abcdef
+SPI: 8765
+Source address: 192.168.123.3
+Destination address: 192.168.123.200
+Sequence number: 2
+IV: f4e76524 4f6407ad f13dc138 0f673f37
+
+Original packet:
+IP header (20 bytes): 45000054 09040000 4001f988 c0a87b03 c0a87bc8
+Data (64 bytes):
+08009f76 a90a0100 b49c083d 02a20400 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+
+Augment data with:
+Padding: 01020304 05060708 090a
+Pad length: 0a
+Next header: 04 (IP-in-IP)
+
+Pre-encryption Data with original IP header, padding, pad length and
+ next header (96 bytes):
+45000054 09040000 4001f988 c0a87b03 c0a87bc8 08009f76 a90a0100 b49c083d
+02a20400 08090a0b 0c0d0e0f 10111213 14151617 18191a1b 1c1d1e1f 20212223
+24252627 28292a2b 2c2d2e2f 30313233 34353637 01020304 05060708 090a0a04
+
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500008c 09050000 4032f91e c0a87b03 c0a87bc8
+SPI/Seq #: 00008765 00000002
+IV: f4e76524 4f6407ad f13dc138 0f673f37
+Encrypted Data (96 bytes):
+773b5241 a4c44922 5e4f3ce5 ed611b0c 237ca96c f74a9301 3c1b0ea1 a0cf70f8
+e4ecaec7 8ac53aad 7a0f022b 859243c6 47752e94 a859352b 8a4d4d2d ecd136e5
+c177f132 ad3fbfb2 201ac990 4c74ee0a 109e0ca1 e4dfe9d5 a100b842 f1c22f0d
+ */
+static unsigned char K10[] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef
+};
+static unsigned char IV10[] = {
+ 0xf4, 0xe7, 0x65, 0x24, 0x4f, 0x64, 0x07, 0xad, 0xf1, 0x3d, 0xc1, 0x38, 0x0f, 0x67, 0x3f, 0x37
+};
+static unsigned char P10[] = {
+ 0x45, 0x00, 0x00, 0x54, 0x09, 0x04, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x88, 0xc0, 0xa8, 0x7b, 0x03,
+ 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0x9f, 0x76, 0xa9, 0x0a, 0x01, 0x00, 0xb4, 0x9c, 0x08, 0x3d,
+ 0x02, 0xa2, 0x04, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
+ 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33,
+ 0x34, 0x35, 0x36, 0x37, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04
+
+};
+static unsigned char C10[] = {
+ 0x77, 0x3b, 0x52, 0x41, 0xa4, 0xc4, 0x49, 0x22, 0x5e, 0x4f, 0x3c, 0xe5, 0xed, 0x61, 0x1b, 0x0c,
+ 0x23, 0x7c, 0xa9, 0x6c, 0xf7, 0x4a, 0x93, 0x01, 0x3c, 0x1b, 0x0e, 0xa1, 0xa0, 0xcf, 0x70, 0xf8,
+ 0xe4, 0xec, 0xae, 0xc7, 0x8a, 0xc5, 0x3a, 0xad, 0x7a, 0x0f, 0x02, 0x2b, 0x85, 0x92, 0x43, 0xc6,
+ 0x47, 0x75, 0x2e, 0x94, 0xa8, 0x59, 0x35, 0x2b, 0x8a, 0x4d, 0x4d, 0x2d, 0xec, 0xd1, 0x36, 0xe5,
+ 0xc1, 0x77, 0xf1, 0x32, 0xad, 0x3f, 0xbf, 0xb2, 0x20, 0x1a, 0xc9, 0x90, 0x4c, 0x74, 0xee, 0x0a,
+ 0x10, 0x9e, 0x0c, 0xa1, 0xe4, 0xdf, 0xe9, 0xd5, 0xa1, 0x00, 0xb8, 0x42, 0xf1, 0xc2, 0x2f, 0x0d
+};
+
+/*
+Case #8: Sample tunnel-mode ESP packet
+ (ping -p ff -s 40 192.168.123.200)
+Key: 01234567 89abcdef 01234567 89abcdef
+SPI: 8765
+Source address: 192.168.123.3
+Destination address: 192.168.123.200
+Sequence number: 5
+IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22
+
+Original packet:
+IP header (20 bytes): 45000044 090c0000 4001f990 c0a87b03 c0a87bc8
+Data (48 bytes):
+0800d63c aa0a0200 c69c083d a3de0300 ffffffff ffffffff ffffffff ffffffff
+ffffffff ffffffff ffffffff ffffffff
+
+Augment data with:
+Padding: 01020304 05060708 090a
+Pad length: 0a
+Next header: 04 (IP-in-IP)
+
+Pre-encryption Data with original IP header, padding, pad length and
+ next header (80 bytes):
+45000044 090c0000 4001f990 c0a87b03 c0a87bc8 0800d63c aa0a0200 c69c083d
+a3de0300 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff
+ffffffff 01020304 05060708 090a0a04
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500007c 090d0000 4032f926 c0a87b03 c0a87bc8
+SPI/Seq #: 00008765 00000005
+IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22
+Encrypted Data (80 bytes):
+15b92683 819596a8 047232cc 00f7048f e45318e1 1f8a0f62 ede3c3fc 61203bb5
+0f980a08 c9843fd3 a1b06d5c 07ff9639 b7eb7dfb 3512e5de 435e7207 ed971ef3
+d2726d9b 5ef6affc 6d17a0de cbb13892
+ */
+static unsigned char K11[] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef
+};
+static unsigned char IV11[] = {
+ 0x85, 0xd4, 0x72, 0x24, 0xb5, 0xf3, 0xdd, 0x5d, 0x21, 0x01, 0xd4, 0xea, 0x8d, 0xff, 0xab, 0x22
+};
+static unsigned char P11[] = {
+ 0x45, 0x00, 0x00, 0x44, 0x09, 0x0c, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x90, 0xc0, 0xa8, 0x7b, 0x03,
+ 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0xd6, 0x3c, 0xaa, 0x0a, 0x02, 0x00, 0xc6, 0x9c, 0x08, 0x3d,
+ 0xa3, 0xde, 0x03, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04
+};
+static unsigned char C11[] = {
+ 0x15, 0xb9, 0x26, 0x83, 0x81, 0x95, 0x96, 0xa8, 0x04, 0x72, 0x32, 0xcc, 0x00, 0xf7, 0x04, 0x8f,
+ 0xe4, 0x53, 0x18, 0xe1, 0x1f, 0x8a, 0x0f, 0x62, 0xed, 0xe3, 0xc3, 0xfc, 0x61, 0x20, 0x3b, 0xb5,
+ 0x0f, 0x98, 0x0a, 0x08, 0xc9, 0x84, 0x3f, 0xd3, 0xa1, 0xb0, 0x6d, 0x5c, 0x07, 0xff, 0x96, 0x39,
+ 0xb7, 0xeb, 0x7d, 0xfb, 0x35, 0x12, 0xe5, 0xde, 0x43, 0x5e, 0x72, 0x07, 0xed, 0x97, 0x1e, 0xf3,
+ 0xd2, 0x72, 0x6d, 0x9b, 0x5e, 0xf6, 0xaf, 0xfc, 0x6d, 0x17, 0xa0, 0xde, 0xcb, 0xb1, 0x38, 0x92
+};
+
+
+// Smaller of two values; note that each argument is evaluated twice.
+#define min_size(a, b) (((a)<(b))?(a):(b))
+// Plain and cypher text will be the same size
+// Those vectors using strings for plain text have an extra null terminator that needs
+// to be ignored
+// Taking the smaller of the two array sizes drops that terminator.
+#define vect_size(P, C) (min_size((sizeof(P)),(sizeof(C))))
+// Key length is the size in bytes of the key array itself.
+#define CBC_KEY_LEN(kdata) (sizeof(kdata))
+
+// Field order: {K, K_LEN, IV, P_LEN, P, EXP_C, C, KEYS}
+// C and KEYS are left NULL here; the test code fills them in.
+#define vector(N) {K##N, (CBC_KEY_LEN(K##N)), IV##N, vect_size(P##N,C##N), P##N, C##N, NULL, NULL, /*NULL, NULL*/}
+// Table of the standard vectors defined above (K1/IV1/P1/C1 .. K11/IV11/P11/C11).
+struct cbc_vector const cbc_vectors[] = {
+ vector(1),
+ vector(2),
+ vector(3),
+ vector(4),
+ vector(5),
+ vector(6),
+ vector(7),
+ vector(8),
+ vector(9),
+ vector(10),
+ vector(11),
+};
+
+#endif /* AES_CBC_STD_VECTORS_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
new file mode 100644
index 00000000..4af56207
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
@@ -0,0 +1,443 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <aes_cbc.h>
+#include "types.h"
+#include "ossl_helper.h"
+#include "cbc_std_vectors.h"
+
+//define CBC_VECTORS_VERBOSE
+//define CBC_VECTORS_EXTRA_VERBOSE
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 100
+#endif
+#ifndef TEST_LEN
+# define TEST_LEN (8*1024*1024)
+#endif
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+#ifndef MAX_UNALINED
+# define MAX_UNALINED (16)
+#endif
+
+// Key sizes the random test picks from (indexed by a random Kindex).
+static cbc_key_size const Ksize[] = { CBC_128_BITS, CBC_192_BITS, CBC_256_BITS };
+
+// Common function-pointer shape that check_vector() casts the
+// aes_cbc_enc_* / aes_cbc_dec_* routines to, so one code path can drive
+// every key size: (input, IV, expanded keys, output, length in bytes).
+typedef void (*aes_cbc_generic) (uint8_t * in,
+ uint8_t * IV,
+ uint8_t * keys, uint8_t * out, uint64_t len_bytes);
+
+/*
+ * CBC-encrypt `in` into `out` through the OpenSSL helper that matches k_len.
+ *
+ * k_len     - CBC_128_BITS, CBC_192_BITS or CBC_256_BITS
+ * key       - raw AES key
+ * in / out  - plaintext input and ciphertext output, len_bytes each
+ * iv        - initialization vector
+ *
+ * Returns 0 after calling the helper, 1 (with a message on stderr) when
+ * k_len is not one of the supported key sizes.
+ */
+int OpenSslEnc(uint8_t k_len,
+	       uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes)
+{
+	switch (k_len) {
+	case CBC_128_BITS:
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+		printf(" OpenSSL128 ");
+#endif
+		openssl_aes_128_cbc_enc(key, iv, len_bytes, in, out);
+		break;
+	case CBC_192_BITS:
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+		printf(" OpenSSL192 ");
+#endif
+		openssl_aes_192_cbc_enc(key, iv, len_bytes, in, out);
+		break;
+	case CBC_256_BITS:
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+		printf(" OpenSSL256 ");
+		fflush(0);
+#endif
+		openssl_aes_256_cbc_enc(key, iv, len_bytes, in, out);
+		break;
+	default:
+		fprintf(stderr, "Invalid key length: %d\n", k_len);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * CBC-decrypt `in` into `out` through the OpenSSL helper that matches k_len.
+ *
+ * k_len     - CBC_128_BITS, CBC_192_BITS or CBC_256_BITS
+ * key       - raw AES key
+ * in / out  - ciphertext input and plaintext output, len_bytes each
+ * iv        - initialization vector
+ *
+ * Returns 0 after calling the helper, 1 (with a message on stderr) when
+ * k_len is not one of the supported key sizes.
+ */
+int OpenSslDec(uint8_t k_len,
+	       uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes)
+{
+	switch (k_len) {
+	case CBC_128_BITS:
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+		printf(" OpenSSL128 ");
+#endif
+		openssl_aes_128_cbc_dec(key, iv, len_bytes, in, out);
+		break;
+	case CBC_192_BITS:
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+		printf(" OpenSSL192 ");
+#endif
+		openssl_aes_192_cbc_dec(key, iv, len_bytes, in, out);
+		break;
+	case CBC_256_BITS:
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+		printf(" OpenSSL256 ");
+#endif
+		openssl_aes_256_cbc_dec(key, iv, len_bytes, in, out);
+		break;
+	default:
+		fprintf(stderr, "Invalid key length: %d\n", k_len);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Fill `size` bytes starting at `data` with pseudo-random values.
+ *
+ * Each byte is the low 8 bits of one rand() call, so the output is fully
+ * determined by the preceding srand() seed.  A size of 0 writes nothing.
+ */
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+	uint32_t i;
+
+	// Index has the same unsigned type as `size`: no signed/unsigned
+	// comparison and no signed overflow for sizes above INT_MAX.
+	for (i = 0; i < size; i++) {
+		data[i] = (uint8_t) rand();
+	}
+}
+
+/*
+ * Compare `len` bytes of `test` against `expected`.
+ *
+ * On a mismatch, prints " failed <data_name>" followed by the first
+ * differing byte pair, its offset and the total length (all in hex).
+ *
+ * Returns 0 when the buffers are identical (including len == 0),
+ * 1 otherwise.
+ */
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+	uint64_t a;
+
+	if (0 == memcmp(test, expected, len))
+		return 0;
+
+	printf(" failed %s \t\t", data_name);
+	for (a = 0; a < len; a++) {
+		if (test[a] != expected[a]) {
+			// uint64_t is not necessarily `long`; cast so the
+			// arguments match the %llx conversions everywhere.
+			printf(" '%x' != '%x' at %llx of %llx\n",
+			       test[a], expected[a],
+			       (unsigned long long)a, (unsigned long long)len);
+			break;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Run one CBC test case through ISA-L and OpenSSL and cross-check results.
+ *
+ * Steps:
+ *   1. pick the ISA-L encrypt/decrypt routines for vector->K_LEN,
+ *   2. expand the key into vector->KEYS,
+ *   3. encrypt P into C with ISA-L; compare with EXP_C (when given) and
+ *      with the OpenSSL ciphertext,
+ *   4. decrypt with ISA-L (both ciphertexts) and with OpenSSL, comparing
+ *      each result with a saved copy of the plaintext.
+ *
+ * Side effects: vector->C and vector->KEYS are overwritten, and vector->P
+ * is zeroed and rewritten by each decryption.
+ *
+ * Returns 0 when every comparison matched, non-zero on any mismatch,
+ * unsupported key length, key-expansion failure or allocation failure.
+ */
+int check_vector(struct cbc_vector *vector)
+{
+	uint8_t *pt_test = NULL;
+	uint8_t *o_ct_test = NULL;
+	int OK = 0;
+	aes_cbc_generic enc;
+	aes_cbc_generic dec;
+
+#ifdef CBC_VECTORS_VERBOSE
+	printf(" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN);
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+	printf(" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C,
+	       vector->IV, vector->EXP_C, vector->KEYS);
+#endif
+	fflush(0);
+#else
+	printf(".");
+#endif
+
+	if (CBC_128_BITS == vector->K_LEN) {
+		enc = (aes_cbc_generic) & aes_cbc_enc_128;
+		dec = (aes_cbc_generic) & aes_cbc_dec_128;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+		printf(" CBC128 ");
+#endif
+	} else if (CBC_192_BITS == vector->K_LEN) {
+		enc = (aes_cbc_generic) & aes_cbc_enc_192;
+		dec = (aes_cbc_generic) & aes_cbc_dec_192;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+		printf(" CBC192 ");
+#endif
+	} else if (CBC_256_BITS == vector->K_LEN) {
+		enc = (aes_cbc_generic) & aes_cbc_enc_256;
+		dec = (aes_cbc_generic) & aes_cbc_dec_256;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+		printf(" CBC256 ");
+#endif
+	} else {
+		printf("Invalid key length: %d\n", vector->K_LEN);
+		return 1;
+	}
+
+	// Scratch buffers: a saved copy of the plaintext and the OpenSSL
+	// ciphertext.  Both are released at `out`.
+	pt_test = malloc(vector->P_LEN);
+	o_ct_test = malloc(vector->P_LEN);
+	if ((pt_test == NULL) || (o_ct_test == NULL)) {
+		fprintf(stderr, "Can't allocate ciphertext memory\n");
+		OK = 1;
+		goto out;
+	}
+
+	if (0 != aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS)) {
+		fprintf(stderr, "Key expansion failed\n");
+		OK = 1;
+		goto out;
+	}
+#ifdef CBC_VECTORS_VERBOSE
+	fflush(0);
+#endif
+	////
+	// ISA-l Encrypt
+	////
+	enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN);
+	if (NULL != vector->EXP_C) {	// when the expected cipher text is known, verify it
+		OK |=
+		    check_data(vector->EXP_C, vector->C, vector->P_LEN,
+			       "ISA-L expected cypher text (C)");
+	}
+	OpenSslEnc(vector->K_LEN, vector->K, vector->P, vector->IV, o_ct_test, vector->P_LEN);
+	OK |=
+	    check_data(vector->C, o_ct_test, vector->P_LEN,
+		       "OpenSSL vs ISA-L cypher text (C)");
+
+	// Keep a reference copy, then clear P so each decrypt must refill it
+	memcpy(pt_test, vector->P, vector->P_LEN);
+	memset(vector->P, 0, vector->P_LEN);
+#ifdef CBC_VECTORS_VERBOSE
+	fflush(0);
+#endif
+
+	////
+	// ISA-l Decrypt
+	////
+	dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+	OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)");
+	memset(vector->P, 0, vector->P_LEN);
+	dec(o_ct_test, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+	OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted OpenSSL (P)");
+	memset(vector->P, 0, vector->P_LEN);
+	OpenSslDec(vector->K_LEN, vector->K, vector->C, vector->IV, vector->P, vector->P_LEN);
+	OK |= check_data(vector->P, pt_test, vector->P_LEN, "OpenSSL decrypted ISA-L (P)");
+#ifdef CBC_VECTORS_VERBOSE
+	if (OK)
+		printf("Failed");
+	else
+		printf("Passed");
+
+	printf("\n");
+#endif
+
+      out:
+	free(pt_test);
+	free(o_ct_test);
+	return OK;
+}
+
+/*
+ * Run check_vector() over every entry of cbc_vectors[].
+ *
+ * Each vector is copied so the table itself stays untouched; the copy gets
+ * a 16-byte aligned key block, a ciphertext buffer of P_LEN bytes and a
+ * pointer to a 16-byte aligned copy of the IV.
+ *
+ * Returns 0 when all vectors pass, 1 on the first failing vector or on an
+ * allocation failure.  All buffers are released on every path.
+ */
+int test_std_combinations(void)
+{
+	int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]);
+	int i;
+	int ret = 0;
+	uint8_t *iv = NULL;
+
+	printf("AES CBC standard test vectors:");
+#ifdef CBC_VECTORS_VERBOSE
+	printf("\n");
+#endif
+	if ((0 != posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN))) || (NULL == iv))
+		return 1;
+
+	for (i = 0; (i < vectors_cnt) && (0 == ret); i++) {
+		struct cbc_vector vect = cbc_vectors[i];
+
+		vect.KEYS = NULL;
+		vect.C = NULL;
+		// The pointer is unspecified after a failed posix_memalign,
+		// so reset it instead of trusting its value.
+		if (0 != posix_memalign((void **)&vect.KEYS, 16, (sizeof(*vect.KEYS))))
+			vect.KEYS = NULL;
+		if (NULL != vect.KEYS)
+			vect.C = malloc(vect.P_LEN);
+
+		if ((NULL == vect.KEYS) || (NULL == vect.C)) {
+			ret = 1;
+		} else {
+			// IV data must be aligned to 16 byte boundary so move data in aligned buffer and change out the pointer
+			memcpy(iv, vect.IV, CBC_IV_DATA_LEN);
+			vect.IV = iv;
+#ifdef CBC_VECTORS_VERBOSE
+			printf("vector[%d of %d] ", i, vectors_cnt);
+#endif
+			if (0 == (i % 25))
+				printf("\n");
+			if (0 == (i % 10))
+				fflush(0);
+
+			if (0 != check_vector(&vect))
+				ret = 1;
+		}
+
+		if (NULL != vect.KEYS)
+			aligned_free(vect.KEYS);
+		free(vect.C);
+	}
+
+	aligned_free(iv);
+	if (0 == ret)
+		printf("\n");
+	return ret;
+}
+
+/*
+ * Run RANDOMS randomly generated vectors through check_vector().
+ *
+ * For each iteration a random 16-byte-multiple plaintext length, a random
+ * buffer misalignment (0..MAX_UNALINED-1) and a random key size from Ksize[]
+ * are chosen; plaintext, key and IV are filled with random data.
+ *
+ * Returns 0 on success, 1 if an aligned allocation or a vector check fails,
+ * -1 if a data buffer malloc fails. All buffers are released on every path.
+ */
+int test_random_combinations(void)
+{
+	struct cbc_vector test;
+	int t;
+	int ret = 0;
+
+	printf("AES CBC random test vectors:");
+#ifdef CBC_VECTORS_VERBOSE
+	fflush(0);
+#endif
+	test.IV = NULL;
+	test.KEYS = NULL;
+	// The pointers are only meaningful when posix_memalign() reports success
+	if (0 != posix_memalign((void **)&test.IV, 16, (CBC_IV_DATA_LEN)))
+		test.IV = NULL;
+	if (0 != posix_memalign((void **)&test.KEYS, 16, (sizeof(*test.KEYS))))
+		test.KEYS = NULL;
+	if ((NULL == test.IV) || (NULL == test.KEYS)) {
+		aligned_free(test.IV);
+		aligned_free(test.KEYS);
+		return 1;
+	}
+
+	for (t = 0; (RANDOMS > t) && (0 == ret); t++) {
+		int Plen = 16 + ((rand() % TEST_LEN) & ~0xf);	//must be a 16byte multiple
+		int offset = (rand() % MAX_UNALINED);
+		int Kindex = (rand() % (sizeof(Ksize) / sizeof(Ksize[0])));	// select one of the valid key sizes
+		// Keep the pointers returned by malloc so they can be freed directly
+		uint8_t *p_base;
+		uint8_t *c_base;
+		uint8_t *k_base;
+
+		if (0 == (t % 25))
+			printf("\n");
+		if (0 == (t % 10))
+			fflush(0);
+
+		test.C = NULL;
+		test.P = NULL;
+		test.K = NULL;
+		test.EXP_C = NULL;
+		test.P_LEN = Plen;
+		test.K_LEN = Ksize[Kindex];
+
+		p_base = malloc(test.P_LEN + offset);
+		c_base = malloc(test.P_LEN + offset);
+		k_base = malloc(test.K_LEN + offset);
+		if ((NULL == p_base) || (NULL == c_base) || (NULL == k_base)) {
+			printf("malloc of testsize:0x%x failed\n", Plen);
+			ret = -1;
+		} else {
+			// Deliberately misalign the data buffers by 'offset' bytes
+			test.P = p_base + offset;
+			test.C = c_base + offset;
+			test.K = k_base + offset;
+
+			mk_rand_data(test.P, test.P_LEN);
+			mk_rand_data(test.K, test.K_LEN);
+			mk_rand_data(test.IV, CBC_IV_DATA_LEN);
+
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+			printf(" Offset:0x%x ", offset);
+#endif
+			if (0 != check_vector(&test))
+				ret = 1;
+		}
+
+		free(c_base);
+		free(k_base);
+		free(p_base);
+	}
+
+	aligned_free(test.IV);
+	aligned_free(test.KEYS);
+	if (0 == ret)
+		printf("\n");
+	return ret;
+}
+
+/*
+ * Exercise the CBC routines with buffers placed close to the end of
+ * PAGE_LEN-sized allocations, so that reads or writes past the end of a
+ * buffer are likely to be caught by a guard-page allocator.
+ *
+ * For every key size in Ksize[] and every offset in 0..MAX_UNALINED-1 the
+ * plaintext/ciphertext/key buffers are positioned 'offset' bytes before the
+ * end of their page; IV and key schedule are additionally rounded down so
+ * that they are aligned.
+ *
+ * Returns 0 on success, -1 if a page allocation fails, 1 on the first failing
+ * vector. All pages are released on every path.
+ */
+int test_efence_combinations(void)
+{
+	struct cbc_vector test;
+	int offset = 0;
+	int key_idx;
+	int ret = 0;
+	uint8_t *P = NULL, *C = NULL, *K = NULL, *IV = NULL;
+	uint8_t *key_data = NULL;
+
+	P = malloc(PAGE_LEN);
+	C = malloc(PAGE_LEN);
+	K = malloc(PAGE_LEN);
+	IV = malloc(PAGE_LEN);
+	key_data = malloc(PAGE_LEN);
+
+	if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV)
+	    || (NULL == key_data)
+	    ) {
+		printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+		ret = -1;
+		goto cleanup;
+	}
+	test.EXP_C = NULL;
+
+	printf("AES CBC efence test vectors:");
+	for (key_idx = 0; key_idx < (int)(sizeof(Ksize) / sizeof(Ksize[0])); key_idx++) {
+		test.K_LEN = Ksize[key_idx];
+
+		for (offset = 0; MAX_UNALINED > offset; offset++) {
+			if (0 == (offset % 80))
+				printf("\n");
+			// move the start and size of the data block towards the end of the page;
+			// the length is rounded down to a multiple of 256 (hence of 16)
+			test.P_LEN = ((PAGE_LEN / (1 + (2 * offset))) & ~0xff);
+			if (16 > test.P_LEN)
+				test.P_LEN = 16;
+			//Place data at end of page
+			test.P = P + PAGE_LEN - test.P_LEN - offset;
+			test.C = C + PAGE_LEN - test.P_LEN - offset;
+			test.K = K + PAGE_LEN - test.K_LEN - offset;
+			// IV and key schedule need 16-byte alignment; rounding the address
+			// down to a 256-byte boundary satisfies that
+			test.IV = IV + PAGE_LEN - CBC_IV_DATA_LEN - offset;
+			test.IV = test.IV - ((uintptr_t) test.IV & 0xff);
+			test.KEYS = (struct cbc_key_data *)
+			    (key_data + PAGE_LEN - sizeof(*test.KEYS) - offset);
+			test.KEYS = (struct cbc_key_data *)
+			    ((uint8_t *) test.KEYS - ((uintptr_t) test.KEYS & 0xff));
+
+			mk_rand_data(test.P, test.P_LEN);
+			mk_rand_data(test.K, test.K_LEN);
+			mk_rand_data(test.IV, CBC_IV_DATA_LEN);
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+			printf(" Offset:0x%x ", offset);
+#endif
+			if (0 != check_vector(&test)) {
+				ret = 1;
+				goto cleanup;
+			}
+		}
+
+	}
+	printf("\n");
+
+cleanup:
+	// free(NULL) is a no-op, so this also covers partially failed allocation
+	free(P);
+	free(C);
+	free(K);
+	free(IV);
+	free(key_data);
+	return ret;
+}
+
+/*
+ * Test driver: seeds the PRNG with TEST_SEED, runs the standard, random and
+ * efence test groups and reports the combined result. The process exit status
+ * is the OR of the three group results (0 means everything passed).
+ */
+int main(void)
+{
+	uint32_t status;
+
+	srand(TEST_SEED);
+	status = test_std_combinations();
+	status |= test_random_combinations();
+	status |= test_efence_combinations();
+
+	if (0 != status)
+		printf("...Fail\n");
+	else
+		printf("...Pass\n");
+
+	return status;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c
new file mode 100644
index 00000000..fef096e4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c
@@ -0,0 +1,183 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * Run list of standard CBC test vectors through encode and decode checks.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <aes_cbc.h>
+#include "types.h"
+#include "cbc_std_vectors.h"
+
+/*
+ * Common call signature used by check_vector() to invoke the key-size specific
+ * CBC encrypt and decrypt routines through a single function pointer:
+ * input buffer, IV, expanded round keys, output buffer and length in bytes.
+ */
+typedef void (*aes_cbc_generic) (uint8_t * in, uint8_t * IV, uint8_t * keys, uint8_t * out,
+ uint64_t len_bytes);
+
+/*
+ * Compare two buffers and report the first difference.
+ *
+ * test      - data produced by the code under test
+ * expected  - reference data
+ * len       - number of bytes to compare
+ * data_name - label printed in the failure message
+ *
+ * Returns 0 when the first len bytes are identical, 1 otherwise. On mismatch
+ * the differing byte values and their offset are printed to stdout.
+ */
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+	uint64_t a;
+
+	if (0 == memcmp(test, expected, len))
+		return 0;
+
+	printf(" failed %s \t\t", data_name);
+	for (a = 0; a < len; a++) {
+		if (test[a] != expected[a]) {
+			// uint64_t is not 'unsigned long' on every platform, so convert
+			// explicitly to a type that matches the format specifier
+			printf(" '%x' != '%x' at %llx of %llx\n",
+			       test[a], expected[a],
+			       (unsigned long long)a, (unsigned long long)len);
+			break;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Encrypt and decrypt one CBC vector with the ISA-L routines and verify it.
+ *
+ * The key schedule is expanded into vector->KEYS, vector->P is encrypted into
+ * vector->C and, when vector->EXP_C is set, compared with the expected
+ * ciphertext. The plaintext is then saved, vector->P is cleared, and the
+ * ciphertext is decrypted back into vector->P and compared with the saved copy.
+ *
+ * Returns 0 on success, non-zero on an invalid key length, an allocation
+ * failure or a data mismatch.
+ */
+int check_vector(struct cbc_vector *vector)
+{
+	uint8_t *pt_test = NULL;
+	int OK = 0;
+	aes_cbc_generic enc;
+	aes_cbc_generic dec;
+
+	DEBUG_PRINT((" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN));
+	DEBUG_PRINT((" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C,
+		     vector->IV, vector->EXP_C, vector->KEYS));
+	printf(".");
+
+	// Select the encrypt/decrypt pair that matches the key size
+	switch (vector->K_LEN) {
+	case CBC_128_BITS:
+		enc = (aes_cbc_generic) & aes_cbc_enc_128;
+		dec = (aes_cbc_generic) & aes_cbc_dec_128;
+		DEBUG_PRINT((" CBC128 "));
+		break;
+	case CBC_192_BITS:
+		enc = (aes_cbc_generic) & aes_cbc_enc_192;
+		dec = (aes_cbc_generic) & aes_cbc_dec_192;
+		DEBUG_PRINT((" CBC192 "));
+		break;
+	case CBC_256_BITS:
+		enc = (aes_cbc_generic) & aes_cbc_enc_256;
+		dec = (aes_cbc_generic) & aes_cbc_dec_256;
+		DEBUG_PRINT((" CBC256 "));
+		break;
+	default:
+		// Cast so the argument matches %d, as done for the debug print above
+		printf("Invalid key length: %d\n", (int)vector->K_LEN);
+		return 1;
+	}
+
+	// Allocate space for a copy of the original plaintext
+	pt_test = malloc(vector->P_LEN);
+
+	if (pt_test == NULL) {
+		fprintf(stderr, "Can't allocate ciphertext memory\n");
+		return 1;
+	}
+
+	aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS);
+
+	////
+	// ISA-l Encrypt
+	////
+	enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN);
+
+	if (NULL != vector->EXP_C) {	//when the encrypted text is known verify correct
+		OK |= check_data(vector->EXP_C, vector->C, vector->P_LEN,
+				 "ISA-L expected cypher text (C)");
+	}
+	// Keep the plaintext for comparison, then wipe the buffer so that the
+	// decrypt below has to regenerate it
+	memcpy(pt_test, vector->P, vector->P_LEN);
+	memset(vector->P, 0, vector->P_LEN);
+
+	////
+	// ISA-l Decrypt
+	////
+	dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+	OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)");
+	DEBUG_PRINT((OK ? "Failed\n" : "Passed\n"));
+
+	free(pt_test);
+	return OK;
+}
+
+/*
+ * Run every entry of cbc_vectors[] through check_vector().
+ *
+ * Each vector is copied so that its IV can be redirected to a 16-byte aligned
+ * scratch buffer and so that per-vector key-schedule and ciphertext buffers
+ * can be attached. All buffers are released on every path, including errors.
+ *
+ * Returns 0 when all vectors pass, 1 on allocation failure or on the first
+ * failing vector.
+ */
+int test_std_combinations(void)
+{
+	int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]);
+	int i;
+	int ret = 0;
+	uint8_t *iv = NULL;
+
+	printf("AES CBC standard test vectors: ");
+
+	// The pointer is only meaningful when posix_memalign() reports success
+	if (0 != posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)) || NULL == iv)
+		return 1;
+
+	for (i = 0; (i < vectors_cnt) && (0 == ret); i++) {
+		struct cbc_vector vect = cbc_vectors[i];
+
+		if (0 != posix_memalign((void **)&(vect.KEYS), 16, sizeof(*vect.KEYS)))
+			vect.KEYS = NULL;
+		vect.C = malloc(vect.P_LEN);
+
+		if ((NULL == vect.KEYS) || (NULL == vect.C)) {
+			ret = 1;
+		} else {
+			// IV data must be aligned to 16 byte boundary so move data in
+			// aligned buffer and change out the pointer
+			memcpy(iv, vect.IV, CBC_IV_DATA_LEN);
+			vect.IV = iv;
+
+			DEBUG_PRINT(("vector[%d of %d] ", i, vectors_cnt));
+
+			if (0 != check_vector(&vect))
+				ret = 1;
+		}
+
+		// Both release calls accept NULL, so partial allocations are handled too
+		aligned_free(vect.KEYS);
+		free(vect.C);
+	}
+
+	aligned_free(iv);
+	return ret;
+}
+
+/*
+ * Test driver: runs the standard CBC vectors, prints the overall verdict and
+ * returns the result as the process exit status (0 means pass).
+ */
+int main(void)
+{
+	uint32_t status = test_std_combinations();
+
+	if (0 == status)
+		printf("Pass\n");
+	else
+		printf("Fail\n");
+
+	return status;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm
new file mode 100644
index 00000000..62c3e344
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm
@@ -0,0 +1,1996 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[2] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; Per the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; Size of the 4 general-purpose registers (8 bytes each) that are pushed on
+; the stack so that their values can be preserved.
+%define STACK_OFFSET 8*4
+
+; Byte offsets of the 16-byte scratch slots used to spill AES states 2..8.
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+; Total size of the scratch area above: 7 slots (TMP2..TMP8) of 16 bytes.
+%define LOCAL_STORAGE 16*7
+
+; Extra room for ten 16-byte XMM registers, reserved only for the win64 output
+; format (presumably for saving xmm6-xmm15 - confirm against the function
+; prologue, which is outside this section).
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+; Sum of the scratch slots and the optional XMM save area.
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+; Arguments: %1 = GH (input and result), %2 = HK (not written),
+; %3..%7 = xmm scratch registers T1..T5 (all overwritten).
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba
+ vpshufd %%T2, %%GH, 01001110b ; swap the two 64-bit halves of GH
+ vpshufd %%T3, %%HK, 01001110b ; swap the two 64-bit halves of HK
+ vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
+ vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ vpxor %%T2, %%T2, %%GH
+ vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs
+ vpxor %%GH, %%GH, %%T3
+ vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
+
+ ;first phase of the reduction
+ vpslld %%T2, %%GH, 31 ; packed left shift of each dword: << 31
+ vpslld %%T3, %%GH, 30 ; packed left shift of each dword: << 30
+ vpslld %%T4, %%GH, 25 ; packed left shift of each dword: << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%GH,1 ; packed right shift of each dword: >> 1
+ vpsrld %%T3,%%GH,2 ; packed right shift of each dword: >> 2
+ vpsrld %%T4,%%GH,7 ; packed right shift of each dword: >> 7
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpxor %%T2, %%T2, %%T5 ; fold in the bits saved during the first phase
+ vpxor %%GH, %%GH, %%T2
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
+
+
+; PRECOMPUTE: starting from the hash key in %%HK, repeatedly multiplies by %%HK
+; with GHASH_MUL and stores the results at [%%GDATA + HashKey_2] ..
+; [%%GDATA + HashKey_8]. For every power (including the first) the XOR of its
+; low and high 64-bit halves is also stored at the matching HashKey_*_k offset.
+; %%T1..%%T6 are xmm scratch registers and are overwritten; %%HK is not written.
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b ; swap the 64-bit halves
+ vpxor %%T1, %%T5 ; %%T1 = high half XOR low half (in both halves)
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT ; start from all zero so unread bytes stay 0
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH ; points one past the last input byte
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2 ; fewer than 8 bytes: gather them one at a time
+ vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists
+ je %%_done ; still tests the flags of the cmp above (vpinsrq leaves flags alone): exactly 8 bytes
+
+ sub %%COUNTER, 8 ; bytes remaining beyond the first 8
+
+ ; Bytes are read from the end of the input backwards; shifting %%TMP1 left
+ ; before each insert leaves the lowest-addressed byte in the least
+ ; significant position.
+ ; BYTE() comes from an included file - presumably it names the 8-bit
+ ; sub-register of its argument; confirm in reg_sizes.asm.
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1 ; gathered bytes become the upper qword
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done ; zero length: leave %%OUTPUT as all zero
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0 ; gathered bytes become the lower qword
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 1
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH ; hash accumulator starts at zero
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block ; less than one full block of AAD
+
+ ; Process the AAD one full 16-byte block at a time
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done ; length was a multiple of 16: nothing left
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+ ; Fewer than 16 bytes remain: READ_SMALL_DATA_INPUT loads them into a zeroed
+ ; register, so the final block is zero padded before it is hashed.
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+ mov r13, [%%GDATA + PBlockLen] ; r13 = bytes already held in the partial block
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ ; NOTE(review): unlike the short path below, this load does not add
+ ; %%DATA_OFFSET - presumably the offset is still 0 when the macro is entered; confirm.
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 ; rax, r12, r15 are scratch here
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ ; NOTE(review): the flags set by this cmp are overwritten by the add below
+ ; before anything reads them, so the comparison appears to have no effect.
+ cmp r13, rax
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1 ; keep a copy of the input (ciphertext) for hashing
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1 ; block still not full: defer the GHASH multiply
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block completed: no partial bytes pending
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; record the extra bytes now held
+%%_dec_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2 ; block still not full: defer the GHASH multiply
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block completed: no partial bytes pending
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; record the extra bytes now held
+%%_encode_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN ; block not completed: write only the new input bytes
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ ; more than 8 bytes to write: store the low qword in one go
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+ ; write the remaining bytes one at a time; r13 must be at least 1 on entry
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3
+
+; INITIAL_BLOCKS: encrypts (or decrypts) %%num_initial_blocks blocks with AES
+; in counter mode, folds the resulting ciphertext into the running GHASH value
+; (passed in %%XMM8, returned in %%T3), and - when %%LENGTH is at least 128 -
+; also encrypts the next 8 blocks, leaving their byte-swapped ciphertext in
+; %%XMM1..%%XMM8 (with the hash value XORed into %%XMM1) for later hashing.
+; %%DATA_OFFSET is advanced past every block that is written.
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ ; VEX-encoded move: keeps this macro free of legacy SSE instructions so
+ ; that SSE and AVX encodings are not mixed
+ vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+; round 0: XOR with the first round key
+vmovdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+; rounds 1..9
+%assign j 1
+%rep 9
+vmovdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+; final round (round 10)
+vmovdqu %%T_key, [%%GDATA+16*10]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1 ; when decrypting, the input is the ciphertext to hash
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Prepare the next 8 counter blocks (byte swapped) in %%XMM1..%%XMM8
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ ; round 0: XOR with the first round key
+ vmovdqu %%T_key, [%%GDATA+16*0]
+ vpxor %%XMM1, %%T_key
+ vpxor %%XMM2, %%T_key
+ vpxor %%XMM3, %%T_key
+ vpxor %%XMM4, %%T_key
+ vpxor %%XMM5, %%T_key
+ vpxor %%XMM6, %%T_key
+ vpxor %%XMM7, %%T_key
+ vpxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 9 ; do 9 rounds
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ ; final round: i is 10 after the loop above
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ ; XOR the keystream with the input, store the result and (when decrypting)
+ ; keep the input ciphertext in the register for hashing
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; r11 is the data offset value
+;
+; Arguments:
+;   %1  GDATA           base pointer for round keys (GDATA+16*i) and HashKey_* tables
+;   %2  CYPH_PLAIN_OUT  output buffer
+;   %3  PLAIN_CYPH_IN   input buffer
+;   %4  DATA_OFFSET     byte offset into both buffers (read only; not advanced here)
+;   %5-%10  T1..T6      scratch xmm registers; the reduced GHASH result ends up in T6
+;   %11 CTR             counter block, left holding the 8th newly generated counter
+;   %12-%19 XMM1..XMM8  in: the 8 previous blocks to hash; out: the 8 new blocks,
+;                       byte-swapped, with the GHASH result XORed into XMM1
+;   %20 T7              scratch xmm register
+;   %21 loop_idx        'in_order' selects the [ONE] + byte-swap counter path,
+;                       anything else selects the [ONEf] path without a swap
+;   %22 ENC_DEC         ENC or DEC
+; Uses stack slots [rsp + TMP2] .. [rsp + TMP8] to park XMM2..XMM8.
+; NOTE(review): reg(j) is defined outside this macro; it presumably maps j to xmmj,
+; which would require XMM1..XMM8 to be passed as xmm1..xmm8 - confirm.
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ ; park the 8 previous blocks: the first stays in a register (T2), the rest go to the stack
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8 ; CTR = last counter value generated
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ ; no byte swap on this path; presumably [ONEf] is the increment for a counter
+ ; that is already kept in swapped byte order - TODO confirm against its definition
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8 ; CTR = last counter value generated
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; AES round 0: XOR the first round key into all 8 counter blocks
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%T1
+ vpxor %%XMM2, %%T1
+ vpxor %%XMM3, %%T1
+ vpxor %%XMM4, %%T1
+ vpxor %%XMM5, %%T1
+ vpxor %%XMM6, %%T1
+ vpxor %%XMM7, %%T1
+ vpxor %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+
+ ; AES round 1
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ ; AES round 2
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; GHASH, previous block 1 (held in T2) times HashKey_8.
+ ; Accumulators from here on: T4 = high products, T7 = low products, T6 = middle products
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+
+ vpshufd %%T6, %%T2, 01001110b ; swap the two qwords of the block
+ vpxor %%T6, %%T2 ; %%T6 = a1^a0 in both halves
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%T6, %%T6, %%T5, 0x00 ; %%T6 = (a1^a0) * low qword of HashKey_8_k
+
+
+ ; AES round 3
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ; GHASH, previous block 2 times HashKey_7
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; AES round 4
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; GHASH, previous block 3 times HashKey_6
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; AES round 5
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ ; GHASH, previous block 4 times HashKey_5
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; AES round 6
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ; GHASH, previous block 5 times HashKey_4
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+
+ ; AES round 7
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ; GHASH, previous block 6 times HashKey_3
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; AES round 8
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ; GHASH, previous block 7 times HashKey_2
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; AES round 9 (key goes through T5 here; T1 is reloaded just below)
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ ; GHASH, previous block 8 times HashKey
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; fold the high and low sums into the middle sum
+ vpxor %%T6, %%T4
+ vpxor %%T6, %%T7
+
+
+ ; last round key; it is XORed with the input so that vaesenclast
+ ; yields keystream XOR input in a single instruction
+ vmovdqu %%T5, [%%GDATA + 16*10]
+
+%assign i 0
+%assign j 1
+%rep 8
+ %ifidn %%ENC_DEC, ENC
+
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ ; ENC: the result stays in reg(j); it is written out after the first reduction phase
+ vaesenclast reg(j), reg(j), %%T2
+
+ %else
+
+ ; DEC: T3 receives the output block and is stored at once, while reg(j)
+ ; gets the input block back (T2 ^ T5 undoes the key XOR) for the next GHASH pass
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+
+ %endif
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+ vpslldq %%T3, %%T6, 8 ; %%T3 = %%T6 shifted left by 2 DWs (8 bytes)
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T3
+ vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
+
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed left shift << 31
+ vpslld %%T3, %%T7, 30 ; packed left shift << 30
+ vpslld %%T4, %%T7, 25 ; packed left shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; %%T1 = %%T2 shifted right by 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed right shift >> 1
+ vpsrld %%T3,%%T7,2 ; packed right shift >> 2
+ vpsrld %%T4,%%T7,7 ; packed right shift >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+
+ ; byte-swap the 8 new blocks so they can be hashed on the next pass
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK]
+ vpshufb %%XMM3, [SHUF_MASK]
+ vpshufb %%XMM4, [SHUF_MASK]
+ vpshufb %%XMM5, [SHUF_MASK]
+ vpshufb %%XMM6, [SHUF_MASK]
+ vpshufb %%XMM7, [SHUF_MASK]
+ vpshufb %%XMM8, [SHUF_MASK]
+
+
+ ; chain the running GHASH value into the first of the new blocks
+ vpxor %%XMM1, %%T6
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+; Multiplies XMM1..XMM8 by HashKey_8..HashKey respectively, sums the products
+; and reduces the sum; the result is left in %%T6.
+;   %1      GDATA       base pointer for the HashKey_* tables
+;   %2-%8   T1..T7      scratch xmm registers (T6 receives the result)
+;   %9-%16  XMM1..XMM8  the 8 blocks to hash; XMM1 is overwritten (used as the
+;                       middle-product accumulator), XMM2..XMM8 are only read
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+ ;; Accumulators: %%T6 = high products, %%T7 = low products, %%XMM1 = middle products
+
+
+ ; block 1 times HashKey_8
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpxor %%T2, %%XMM1 ; %%T2 = high qword ^ low qword of the block
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vmovdqu %%T3, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 ; XMM1 is reused as the middle accumulator from here on
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ; block 2 times HashKey_7
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpxor %%T2, %%XMM2
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ; block 3 times HashKey_6
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpxor %%T2, %%XMM3
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ; block 4 times HashKey_5
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpxor %%T2, %%XMM4
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ ; block 5 times HashKey_4
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpxor %%T2, %%XMM5
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ ; block 6 times HashKey_3
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpxor %%T2, %%XMM6
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ ; block 7 times HashKey_2
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpxor %%T2, %%XMM7
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ ; block 8 times HashKey
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpxor %%T2, %%XMM8
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ ; fold the high and low sums into the middle sum; %%T2 = middle ^ high ^ low
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8 ; %%T4 = %%T2 shifted left by 8 bytes
+ vpsrldq %%T2, %%T2, 8 ; %%T2 shifted right by 8 bytes
+
+ vpxor %%T7, %%T4
+ vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed left shift << 31
+ vpslld %%T3, %%T7, 30 ; packed left shift << 30
+ vpslld %%T4, %%T7, 25 ; packed left shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; %%T1 = %%T2 shifted right by 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed right shift >> 1
+ vpsrld %%T3,%%T7,2 ; packed right shift >> 2
+ vpsrld %%T4,%%T7,7 ; packed right shift >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+%endmacro
+
+
+; AES-encrypt one 16-byte block in place.
+;   %1 GDATA  base pointer of the round keys; round key n is read from [GDATA + 16*n]
+;   %2 BLK    xmm register: block to encrypt on entry, encrypted block on exit
+; Eleven round keys are consumed: one whitening XOR, nine vaesenc rounds and a
+; final vaesenclast. The assembly-time counter i is left at 10, as before.
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%BLK %2
+
+ vpxor %%BLK, %%BLK, [%%GDATA + 16*0] ; round 0: key whitening
+%assign i 1
+%rep 9
+ vaesenc %%BLK, %%BLK, [%%GDATA + 16*i] ; rounds 1..9
+%assign i (i+1)
+%endrep
+ vaesenclast %%BLK, %%BLK, [%%GDATA + 16*i] ; i == 10 here: final round
+%endmacro
+
+
+;; Start of Stack Setup
+
+; FUNC_SAVE: function prologue used by the update entry points.
+; Pushes r12-r15, remembers the post-push stack pointer in r14, then carves out
+; VARIABLE_OFFSET bytes of scratch space and aligns rsp down to 64 bytes.
+; On win64 it additionally spills xmm6-xmm15 into the LOCAL_STORAGE area.
+; Must be paired with FUNC_RESTORE, which relies on r14 still holding the saved rsp.
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp ; keep the unaligned rsp so FUNC_RESTORE can undo the alignment
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp down to a 64-byte boundary
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+; FUNC_RESTORE: function epilogue matching FUNC_SAVE.
+; On win64 reloads xmm6-xmm15 from the LOCAL_STORAGE area, then restores rsp from
+; r14 and pops r15..r12 in the reverse order of the pushes in FUNC_SAVE.
+; r14 must still hold the value FUNC_SAVE stored in it when this macro runs.
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14 ; drop the scratch area and undo the 64-byte alignment
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA:
+;   AadLen = A_LEN, InLen = 0, PBlockLen = 0, PBlockEncKey = 0,
+;   OrigIV = the 16 bytes at IV, CurCount = those 16 bytes shuffled with SHUF_MASK.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ ; xmm2 is only scratch after CALC_AAD_HASH; clear it so that the value stored
+ ; to PBlockEncKey below really is zero (XORing it with xmm3 left stale data).
+ vpxor xmm2, xmm2, xmm2
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqu xmm2, [r10] ; load the 16-byte pre-counter block
+ vmovdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = byte-swapped iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; (A zero PLAIN_CYPH_LEN is checked first and skips the whole macro body.)
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; (InLen, AadHash, CurCount and, when a trailing partial block exists, PBlockLen and PBlockEncKey)
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes ; nothing to do for empty input
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ vmovdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA + AadHash]
+
+
+ ; NOTE(review): PARTIAL_BLOCK is defined elsewhere; presumably it finishes a partial
+ ; block left over from a previous call and advances DATA_OFFSET past the bytes it used
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4 ; r12 = number of whole 16-byte blocks
+ and r12, 7 ; r12 = (number of whole blocks) mod 8
+
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+ ; each case below differs only in the block count handed to INITIAL_BLOCKS
+ ; and in the matching number of bytes subtracted from r13 afterwards
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left ; no whole 8-block group remains
+
+ sub r13, 128
+ je %%_eight_cipher_left ; exactly 8 blocks remained: only their GHASH is pending
+
+
+
+
+ ; r15d tracks the low byte of the counter so the loop can tell when adding 8
+ ; would carry out of that byte
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8 ; low byte would wrap: take the in_order path
+
+
+
+ ; no byte wrap: out_order variant of the 8-block macro
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK] ; swap the counter back before leaving the loop
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ ; low byte wraps: swap the counter, run the in_order variant, swap it back
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ ; hash the final 8 blocks; GHASH_LAST_8 leaves its result in its 7th argument (xmm14)
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA + PBlockLen], r13 ; my_ctx_data.partial_blck_length = r13
+ ; handle the last <16 Byte block separately
+
+ vpaddd xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ ; fewer than 16 bytes in total: a 16-byte load is not used on this path
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ ; temporarily move DATA_OFFSET so the 16-byte load ends at the last remaining
+ ; input byte (it starts 16-r13 bytes before the partial block)
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ ; put DATA_OFFSET back where it was
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1 ; keep a copy of the input (ciphertext) for hashing
+ vpxor xmm9, xmm1 ; Ciphertext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm1 ; same mask on the ciphertext copy
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm2 ; XOR the partial ciphertext block into the hash
+ vmovdqu [%%GDATA + AadHash], xmm14
+
+ %else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm9 ; XOR the partial ciphertext block into the hash
+ vmovdqu [%%GDATA + AadHash], xmm14
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ ; more than 8 bytes: write the low 8 at once, then continue with the high half
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ ; write the remaining r13 (1..8) bytes one at a time
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
+; AUTH_TAG_LEN of 16 or 12 writes that many bytes; any other value writes 8 bytes.
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen]
+ vmovdqu xmm14, [%%GDATA+AadHash]
+ vmovdqu xmm13, [%%GDATA+HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ ; a partial block was XORed into the hash but not yet multiplied by the hash key
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA+InLen]
+
+ shl r12, 3 ; convert into number of bits
+ ; move all 64 bits: a 32-bit move would truncate len(A) once the AAD reaches 2^29 bytes
+ vmovq xmm15, r12 ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (bytes * 8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA+OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm14 ; tag = E(K, Y0) XOR GHASH result
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ ; default: write the low 8 bytes of the tag
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ ; low 8 bytes, then the next 4 bytes
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_precomp_avx_gen2
+; (gcm_data *my_ctx_data);
+;
+; Derives the GHASH key from the AES round keys already present in my_ctx_data:
+; encrypts an all-zero block, byte-swaps it, computes HashKey<<1 mod poly, stores
+; that at [my_ctx_data + HashKey] and then invokes PRECOMPUTE.
+; NOTE(review): PRECOMPUTE is defined elsewhere; presumably it fills the
+; HashKey_2..HashKey_8 and HashKey_*_k tables read by the GHASH macros - confirm.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_precomp_avx_gen2
+aesni_gcm128_precomp_avx_gen2:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp ; remember rsp so the alignment below can be undone
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6 ; xmm6 = all-zero block
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, 1 ; shift each qword left by one bit
+ vpsrlq xmm2, 63 ; xmm2 = the bit shifted out of each qword
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8 ; move the low qword's carry up into the high qword
+ vpsrldq xmm1, xmm1, 8 ; xmm1 = the bit shifted out of the high qword
+ vpor xmm6, xmm2 ; xmm6 = HashKey<<1 as a 128-bit value
+ ;reduction
+ ; build a mask from the carried-out bit and use it to select POLY
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, [POLY]
+ vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_init_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+;
+; Thin wrapper: saves the callee-saved registers that GCM_INIT clobbers, expands
+; GCM_INIT on the four arguments and restores them.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_init_avx_gen2
+aesni_gcm128_init_avx_gen2:
+
+ ; GCM_INIT is documented to clobber r10-r13; r12 and r13 are the ones saved here
+ push r12
+ push r13
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows; GCM_INIT is documented to
+ ; clobber xmm0-xmm6, so xmm6 is the only one of them saved here
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_update_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_update_avx_gen2
+aesni_gcm128_enc_update_avx_gen2:
+ ; Exported entry point: expands GCM_ENC_DEC in ENC mode on arg1..arg4
+ ; (my_ctx_data, out, in, plaintext_len in the prototype comment).
+ ; No init or finalize step is performed here.
+
+ FUNC_SAVE ; macro defined outside this view; presumably saves callee-saved state -- confirm
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE ; counterpart of FUNC_SAVE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_update_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len); /* Length of data in Bytes for decryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_update_avx_gen2
+aesni_gcm128_dec_update_avx_gen2:
+ ; Exported entry point: expands GCM_ENC_DEC in DEC mode on arg1..arg4
+ ; (my_ctx_data, out, in, plaintext_len in the prototype comment).
+ ; No init or finalize step is performed here.
+
+ FUNC_SAVE ; macro defined outside this view; presumably saves callee-saved state -- confirm
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE ; counterpart of FUNC_SAVE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_finalize_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_finalize_avx_gen2
+aesni_gcm128_enc_finalize_avx_gen2:
+ ; Exported entry point: expands GCM_COMPLETE in ENC mode on arg1..arg3
+ ; (my_ctx_data, auth_tag, auth_tag_len in the prototype comment).
+
+ push r12 ; saved here, restored before ret
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ ; Only xmm6, xmm9, xmm11, xmm14 and xmm15 are saved on this path
+ ; (presumably the only non-volatile xmm registers GCM_COMPLETE touches -- confirm).
+ sub rsp, 5*16 ; five 16-byte save slots
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; Reload each register from the slot it was stored to above
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16 ; release the save area
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_finalize_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_finalize_avx_gen2
+aesni_gcm128_dec_finalize_avx_gen2:
+ ; Exported entry point: expands GCM_COMPLETE in DEC mode on arg1..arg3
+ ; (my_ctx_data, auth_tag, auth_tag_len in the prototype comment).
+
+ push r12 ; saved here, restored before ret
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ ; Only xmm6, xmm9, xmm11, xmm14 and xmm15 are saved on this path
+ ; (presumably the only non-volatile xmm registers GCM_COMPLETE touches -- confirm).
+ sub rsp, 5*16 ; five 16-byte save slots
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; Reload each register from the slot it was stored to above
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16 ; release the save area
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_avx_gen2
+aesni_gcm128_enc_avx_gen2:
+ ; Exported one-shot entry point: runs init, encrypt and finalize back to
+ ; back inside a single FUNC_SAVE / FUNC_RESTORE pair.
+ ; Argument mapping follows the prototype comment above the label.
+
+ FUNC_SAVE ; macro defined outside this view; presumably saves callee-saved state -- confirm
+
+ GCM_INIT arg1, arg5, arg6, arg7 ; my_ctx_data, iv, aad, aad_len
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC ; my_ctx_data, out, in, plaintext_len
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC ; my_ctx_data, auth_tag, auth_tag_len
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_avx_gen2
+aesni_gcm128_dec_avx_gen2:
+ ; Exported one-shot entry point: runs init, decrypt and finalize back to
+ ; back inside a single FUNC_SAVE / FUNC_RESTORE pair.
+ ; Argument mapping follows the prototype comment above the label.
+
+ FUNC_SAVE ; macro defined outside this view; presumably saves callee-saved state -- confirm
+
+ GCM_INIT arg1, arg5, arg6, arg7 ; my_ctx_data, iv, aad, aad_len
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC ; my_ctx_data, out, in, plaintext_len
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC ; my_ctx_data, auth_tag, auth_tag_len
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm
new file mode 100644
index 00000000..c1acb4c8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm
@@ -0,0 +1,1990 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+; The details of the implementation are explained in:
+; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; By the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any aadLen.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; Four 8-byte registers are pushed onto the stack and must be preserved;
+; STACK_OFFSET is their combined size in bytes.
+%define STACK_OFFSET 8*4
+
+; Byte offsets of the 16-byte scratch slots, used relative to rsp
+; (e.g. [rsp + TMP2]) by the 8-block macro further down in this file.
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+; Total size of the seven scratch slots TMP2..TMP8.
+%define LOCAL_STORAGE 16*7
+
+; Extra save area for xmm registers, reserved only for win64 output:
+; ten 16-byte slots (presumably xmm6..xmm15 -- confirm against FUNC_SAVE).
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+; Scratch slots plus the optional xmm save area.
+; NOTE(review): the expansion is not parenthesised, so it is only safe in
+; purely additive expressions.
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+; Arguments:
+;   %1 GH      in/out: multiplicand on entry, reduced product on exit
+;   %2 HK      in: second operand, only read by this macro
+;   %3..%5     xmm temporaries T1, T2, T3, all overwritten
+;   %6, %7     T4 and T5 are accepted but not referenced in the body
+; The reduction constant is loaded from memory at [POLY2].
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; Four 64x64 carry-less partial products of GH (a1:a0) and HK (b1:b0)
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxor %%GH, %%GH, %%T3 ; %%GH = a1*b0 + a0*b1 (middle term)
+
+
+ ; Split the middle term across the high (T1) and low (GH) 128-bit halves
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxor %%T1, %%T1, %%T3 ; %%T1 = high 128 bits of the 256-bit product
+ vpxor %%GH, %%GH, %%T2 ; %%GH = low 128 bits of the 256-bit product
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2] ; %%T3 holds the reduction constant for both phases
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
+
+
+; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4
+; functions, but are kept to allow users to switch cpu architectures between calls
+; of pre, init, update, and finalize.
+%macro PRECOMPUTE 8
+; Fills the hash-key table in the context at %%GDATA.
+;   %1 GDATA   base address used for all HashKey_* stores
+;   %2 HK      HashKey<<1 mod poly, only read by this macro
+;   %3..%8     xmm temporaries T1..T6, all overwritten
+; Stores HashKey_2..HashKey_8 (successive products with HK) and
+; HashKey_k, HashKey_2_k..HashKey_8_k (high half XOR low half of each power).
+; [GDATA + HashKey] itself is not written here.
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+ ; HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
+ vmovdqa %%T5, %%HK ; %%T5 is the running power, starting at HashKey^1
+
+ vpshufd %%T1, %%T5, 01001110b ; swap the two 64-bit halves
+ vpxor %%T1, %%T5 ; both halves of %%T1 = high XOR low
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ ; Each step below multiplies the running power by HK (GHASH_MUL overwrites
+ ; %%T1 as scratch), stores it, then rebuilds %%T1 as high XOR low.
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+; Arguments:
+;   %1 OUTPUT             xmm register, zeroed first and then filled
+;   %2 INPUT              address of the first input byte, not modified
+;   %3 LENGTH             number of bytes to read, not modified
+;   %4 END_READ_LOCATION  temp: starts at INPUT + LENGTH and walks backwards
+;   %5 COUNTER            temp: bytes still to read
+;   %6 TMP1               temp: accumulates the bytes of one 64-bit lane
+; No memory outside [INPUT, INPUT + LENGTH) is read.
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+ vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist
+ je %%_done ;flags are still those of the cmp above: exactly 8 bytes, nothing left
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION ;bytes are taken from the end backwards, so the earliest byte lands in the lowest position
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1 ;remaining bytes go into the upper 64-bit lane
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done ;zero length: OUTPUT stays all-zero
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0 ;fewer than 8 bytes: all of them go into the lower 64-bit lane
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+; %%A_IN and %%A_LEN are copied into %%T1 / %%T2 before use; %%AAD_HASH is
+; zeroed and then accumulates the result. %%HASH_KEY is only passed on to
+; GHASH_MUL. Full 16-byte blocks are consumed first; a trailing block of
+; fewer than 16 bytes is zero-padded via READ_SMALL_DATA_INPUT.
+; When %%A_LEN is 0 the short-block path still runs, on an all-zero block.
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 1
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done ; length was an exact multiple of 16: no tail block
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ ; T1 points at the remaining bytes, T2 is their count (fewer than 16)
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 7
+; Completes (or further extends) a partial block recorded in the context.
+; Does nothing when [GDATA + PBlockLen] is 0.
+; Reads  [GDATA + PBlockLen], [GDATA + PBlockEncKey], [GDATA + HashKey].
+; Writes [GDATA + PBlockLen], [GDATA + AadHash], output bytes at
+;        [CYPH_PLAIN_OUT + DATA_OFFSET]; DATA_OFFSET is advanced by the
+;        number of bytes written.
+; r13 = bytes already held in the partial block; r15 = PLAIN_CYPH_LEN + r13 - 16
+; (negative when the new data still does not fill the block).
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+ mov r13, [%%GDATA + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ ; NOTE(review): this load does not add DATA_OFFSET while the short path
+ ; below does; the two agree only if DATA_OFFSET is 0 here -- confirm at the call site.
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If 16 or more bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1 ; keep a copy of the input ciphertext for hashing
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1 ; same mask applied to the ciphertext copy
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block is now complete: no partial bytes pending
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still not full: record the extra bytes
+%%_dec_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block is now complete: no partial bytes pending
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still not full: record the extra bytes
+%%_encode_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN ; block not filled: write exactly the bytes supplied
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ ; more than 8 bytes: store the low 8 at once, then continue with the high lane
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ ; store the remaining bytes one at a time, lowest byte of rax first
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 23
+; Encrypts/decrypts %%num_initial_blocks 16-byte blocks, folds them into the
+; running GHASH value, and, when %%LENGTH >= 128, also runs AES-CTR on the
+; next eight blocks so they are ready to be hashed by the caller's main loop.
+;   %1  GDATA               context base: round keys at 16*0..16*10, CurCount
+;   %2  CYPH_PLAIN_OUT      output base pointer
+;   %3  PLAIN_CYPH_IN       input base pointer
+;   %4  LENGTH              compared against 128 only
+;   %5  DATA_OFFSET         advanced by 16 per block written (plus 128 for the 8-block step)
+;   %6  num_initial_blocks  assemble-time constant 0..7
+;   %7  T1                  xmm temp (holds each loaded input block)
+;   %8  HASH_KEY            passed to GHASH_MUL, only read
+;   %9  T3                  on exit: the updated hash value
+;   %10,%11,%21 T4,T5,T6    xmm temps handed to GHASH_MUL
+;   %12 CTR                 counter block, loaded from [GDATA + CurCount]; not stored back here
+;   %13..%20 XMM1..XMM8     XMM8 carries the incoming hash; on the >=128 path
+;                           XMM1..XMM8 leave holding eight byte-swapped blocks
+;   %22 T_key               xmm temp for the current round key
+;   %23 ENC_DEC             ENC or DEC
+; reg(n) is defined outside this view (presumably it names xmm<n> -- confirm).
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ ; VEX-encoded move: keeps this AVX macro free of legacy-SSE instructions,
+ ; which would otherwise cost an SSE/AVX transition or false dependency.
+ vmovdqu reg(i), %%XMM8 ;move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+ ; Build one byte-swapped counter block per initial block
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+ ; Round 0: XOR with the key at [GDATA + 16*0]
+vmovdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+ ; Rounds 1..9: one vaesenc per block per round key
+%assign j 1
+%rep 9
+vmovdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+ ; Final round with the key at [GDATA + 16*10]
+vmovdqu %%T_key, [%%GDATA+16*10]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+ ; XOR the keystream with the input, write the result, and leave the
+ ; ciphertext (byte-swapped) in reg(i) for hashing. On DEC the ciphertext is
+ ; the loaded input, so it replaces the just-written plaintext.
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+ ; Chain the hash: each block absorbs the previous value, then is multiplied
+ ; by HASH_KEY, so the running value moves up one register per block.
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Prepare eight more counter blocks in %%XMM1..%%XMM8
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA+16*0]
+ vpxor %%XMM1, %%XMM1, %%T_key
+ vpxor %%XMM2, %%XMM2, %%T_key
+ vpxor %%XMM3, %%XMM3, %%T_key
+ vpxor %%XMM4, %%XMM4, %%T_key
+ vpxor %%XMM5, %%XMM5, %%T_key
+ vpxor %%XMM6, %%XMM6, %%T_key
+ vpxor %%XMM7, %%XMM7, %%T_key
+ vpxor %%XMM8, %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 9 ; do 9 rounds
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ ; i is 10 after the %rep above, so this loads the last round key
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ ; For each of the eight blocks: XOR with input, store the output, and on DEC
+ ; keep the loaded ciphertext in the register for later hashing.
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ vpxor %%XMM2, %%XMM2, %%T1
+ vpxor %%XMM3, %%XMM3, %%T1
+ vpxor %%XMM4, %%XMM4, %%T1
+ vpxor %%XMM5, %%XMM5, %%T1
+ vpxor %%XMM6, %%XMM6, %%T1
+ vpxor %%XMM7, %%XMM7, %%T1
+ vpxor %%XMM8, %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T1, %%T4, %%T3
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+
+%assign i 0
+%assign j 1
+%rep 8
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpxor %%T7, %%T7, %%T3
+ vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+
+ vpxor %%XMM1, %%T1
+
+
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+; Multiplies XMM1..XMM8 by the key powers stored at HashKey_8..HashKey
+; (XMM1 pairs with HashKey_8, XMM8 with HashKey), XORs the eight products
+; together and reduces the sum using the POLY2 constant.
+; Input: GDATA (%1) - base address the HashKey_* offsets are applied to
+; XMM1..XMM8 (%9..%16) - the eight blocks to hash
+; Output: T6 (%7) holds the reduced result
+; Clobbers T2, T3, T4, T5, T7 and XMM1. T1 (%2) is accepted but never referenced.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+ ;; For every block: T6 accumulates the high*high products (imm 0x11),
+ ;; T7 accumulates the low*low products (imm 0x00) and XMM1 accumulates
+ ;; the (hi^lo)*(hi^lo) middle products.
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+
+ ; vpshufd with 01001110b swaps the two 64-bit halves, so the XOR below
+ ; leaves (hi ^ lo) in the low qword of T2 / T3
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ ; XMM1 is consumed above; from here on it is the middle-term accumulator
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM8
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ ; Karatsuba recombination: middle = (hi^lo products) ^ high ^ low
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ ; split the middle term across the two halves of the 256-bit product
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+; Encrypt one 16-byte block in place.
+; %1 = base address of the round-key schedule (round key n is read from %1 + 16*n)
+; %2 = xmm register holding the block on entry and the encrypted block on exit
+; Performs the round-0 key XOR, nine vaesenc rounds (keys 1..9) and a final
+; vaesenclast with key 10. Leaves the preprocessor variable i set to 10.
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%KEYS %1
+%define %%BLK %2
+
+ vpxor %%BLK, %%BLK, [%%KEYS + 16*0] ; initial whitening with round key 0
+%assign i 1
+%rep 9
+ vaesenc %%BLK, %%BLK, [%%KEYS + 16*i] ; middle round i
+%assign i (i+1)
+%endrep
+ vaesenclast %%BLK, %%BLK, [%%KEYS + 16*10] ; final round
+%endmacro
+
+
+;; Start of Stack Setup
+
+; FUNC_SAVE - common function prologue.
+; Pushes r12-r15, remembers the post-push stack pointer in r14, then reserves
+; VARIABLE_OFFSET bytes and rounds rsp down to a 64-byte boundary.
+; r14 must stay untouched until FUNC_RESTORE, which reloads rsp from it.
+; On win64 builds xmm6-xmm15 are additionally spilled at rsp + LOCAL_STORAGE.
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp ; r14 = stack pointer to return to in FUNC_RESTORE
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp down to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+; FUNC_RESTORE - common function epilogue, the inverse of FUNC_SAVE.
+; On win64 builds reloads xmm6-xmm15 from rsp + LOCAL_STORAGE (rsp must still be
+; the aligned value set up by FUNC_SAVE), then restores rsp from r14 and pops
+; r15, r14, r13, r12 in reverse push order.
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14 ; drop the aligned scratch area; rsp now points at the saved r15
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA:
+; AadLen = A_LEN, InLen = 0, PBlockLen = 0, PBlockEncKey = 0,
+; OrigIV = the 16 bytes at IV, CurCount = those 16 bytes after the SHUF_MASK byte shuffle.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ ; xmm2/xmm3 were scratch registers for CALC_AAD_HASH. Clear xmm2 explicitly so that
+ ; the PBlockEncKey store below really writes zero instead of leftover hash state.
+ vpxor xmm2, xmm2, xmm2
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqu xmm2, [r10]
+ vmovdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK] ; counter is kept in the byte-shuffled form
+
+ vmovdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; (A length of 0 is handled by jumping straight to the end of the macro.)
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+; NOTE(review): the code visible in this macro never writes r14, and FUNC_RESTORE relies on
+; r14 surviving; the invoked helper macros are defined elsewhere - confirm they leave r14 alone.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes ; nothing to do for empty input
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ vmovdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ ; r12 = (number of whole 16-byte blocks) mod 8, used to pick the INITIAL_BLOCKS variant
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ ; r15d tracks the low byte of the counter held in xmm9 so the loop can tell
+ ; whether adding 8 would carry out of that byte
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8 ; low counter byte would wrap: take the in_order variant
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ ; fold the final eight blocks (xmm1..xmm8) into the hash; result lands in xmm14
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA + PBlockLen], r13 ; my_ctx_data.partial_blck_length = r13
+ ; handle the last <16 Byte block separately
+
+
+ vpaddd xmm9, xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA + CurCount], xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ ; fewer than 16 input bytes in total: read them without a 16-byte load
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ ; at least 16 input bytes exist, so load the 16 bytes that END at the last
+ ; input byte and shift the unwanted leading bytes out afterwards
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ ; decrypt: the hash is updated with the (masked) input ciphertext kept in xmm2
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm9, xmm1 ; Ciphertext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm14, xmm2
+ vmovdqu [%%GDATA + AadHash], xmm14
+ %else
+ ; encrypt: the hash is updated with the (masked) output ciphertext in xmm9
+ vpxor xmm9, xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm14, xmm9
+ vmovdqu [%%GDATA + AadHash], xmm14
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ ; more than 8 bytes: store the low qword at once, then continue with the high qword
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ ; store the remaining bytes one at a time, lowest byte of rax first
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; ENC_DEC is accepted for symmetry but is not referenced by the macro body.
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN).
+; AUTH_TAG_LEN of 16 or 12 writes that many bytes; any other value writes 8 bytes.
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15
+; (xmm0, xmm5, xmm6, xmm10 and xmm11 are handed to GHASH_MUL as temporaries)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen]
+ vmovdqu xmm14, [%%GDATA+AadHash]
+ vmovdqu xmm13, [%%GDATA+HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA+InLen]
+
+ shl r12, 3 ; convert into number of bits
+ ; Move the full 64-bit bit count: a 32-bit vmovd would silently drop the upper
+ ; half of len(A) once the AAD reaches 2^29 bytes and produce a wrong tag.
+ vmovq xmm15, r12 ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (bytes * 8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm9, xmm14 ; tag = E(K, Y0) XOR GHASH
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ ; any length other than 16 or 12 falls through to the 8-byte tag
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_precomp_avx_gen4
+; (gcm_data *my_ctx_data);
+; Encrypts an all-zero block with the key schedule at my_ctx_data to obtain the
+; hash key, stores HashKey<<1 mod poly at [my_ctx_data + HashKey] and then runs
+; PRECOMPUTE on it (presumably filling the HashKey_2..HashKey_8 slots - that
+; macro is defined outside this section).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_precomp_avx_gen4
+aesni_gcm128_precomp_avx_gen4:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp ; remember the stack pointer for the epilogue
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6 ; xmm6 = all-zero block
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ ; 128-bit left shift by one: shift both qwords, then carry bit 63 of the
+ ; low qword into the high qword; xmm1 keeps the bit shifted out of the top
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ ; build an all-ones/all-zero mask from the carried-out bit and use it to
+ ; conditionally XOR in POLY
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, xmm2, [POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_init_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; Thin wrapper around GCM_INIT: preserves the callee-saved registers that macro
+; clobbers (r12, r13 and, on win64, xmm6) and returns.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_init_avx_gen4
+aesni_gcm128_init_avx_gen4:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ ; GCM_INIT is documented to clobber xmm0-xmm6, so only xmm6 is saved here
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_update_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+; NOTE(review): GCM_ENC_DEC contains a path for a trailing block shorter than 16 bytes,
+; so the "multiple of 16 bytes" restriction above may be stale - confirm against the C header.
+; Streaming encrypt step: FUNC_SAVE / GCM_ENC_DEC (ENC) / FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_update_avx_gen4
+aesni_gcm128_enc_update_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_update_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len); /* Length of data in Bytes for decryption. must be a multiple of 16 bytes*/
+; NOTE(review): GCM_ENC_DEC contains a path for a trailing block shorter than 16 bytes,
+; so the "multiple of 16 bytes" restriction above may be stale - confirm against the C header.
+; Streaming decrypt step: FUNC_SAVE / GCM_ENC_DEC (DEC) / FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_update_avx_gen4
+aesni_gcm128_dec_update_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_finalize_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; Wrapper around GCM_COMPLETE (ENC) that preserves the callee-saved registers
+; the macro overwrites: r12 everywhere, plus the non-volatile xmm registers on win64.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_finalize_avx_gen4
+aesni_gcm128_enc_finalize_avx_gen4:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows.
+ ; Besides xmm6/9/11/14/15, GCM_COMPLETE loads HashKey into xmm13 and hands
+ ; xmm10 to GHASH_MUL as a temporary, so those two are preserved as well.
+ sub rsp, 7*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm10
+ vmovdqu [rsp + 3*16],xmm11
+ vmovdqu [rsp + 4*16],xmm13
+ vmovdqu [rsp + 5*16],xmm14
+ vmovdqu [rsp + 6*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 6*16]
+ vmovdqu xmm14 , [rsp + 5*16]
+ vmovdqu xmm13 , [rsp + 4*16]
+ vmovdqu xmm11 , [rsp + 3*16]
+ vmovdqu xmm10 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 7*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_finalize_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; Wrapper around GCM_COMPLETE (DEC) that preserves the callee-saved registers
+; the macro overwrites: r12 everywhere, plus the non-volatile xmm registers on win64.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_finalize_avx_gen4
+aesni_gcm128_dec_finalize_avx_gen4:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows.
+ ; Besides xmm6/9/11/14/15, GCM_COMPLETE loads HashKey into xmm13 and hands
+ ; xmm10 to GHASH_MUL as a temporary, so those two are preserved as well.
+ sub rsp, 7*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm10
+ vmovdqu [rsp + 3*16],xmm11
+ vmovdqu [rsp + 4*16],xmm13
+ vmovdqu [rsp + 5*16],xmm14
+ vmovdqu [rsp + 6*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 6*16]
+ vmovdqu xmm14 , [rsp + 5*16]
+ vmovdqu xmm13 , [rsp + 4*16]
+ vmovdqu xmm11 , [rsp + 3*16]
+ vmovdqu xmm10 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 7*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; One-shot encrypt: runs GCM_INIT, GCM_ENC_DEC (ENC) and GCM_COMPLETE back to back
+; between FUNC_SAVE and FUNC_RESTORE.
+; NOTE(review): arg5..arg9 are defined outside this section; presumably they are
+; addressed relative to the r14 value captured by FUNC_SAVE - confirm.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_avx_gen4
+aesni_gcm128_enc_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; One-shot decrypt: runs GCM_INIT, GCM_ENC_DEC (DEC) and GCM_COMPLETE back to back
+; between FUNC_SAVE and FUNC_RESTORE. The computed tag is written to auth_tag;
+; no comparison against an expected tag happens in the code shown here.
+; NOTE(review): arg5..arg9 are defined outside this section; presumably they are
+; addressed relative to the r14 value captured by FUNC_SAVE - confirm.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_avx_gen4
+aesni_gcm128_dec_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm
new file mode 100644
index 00000000..a825b162
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm
@@ -0,0 +1,2033 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[2] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; By the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any other aadLen value.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; 4 registers (8 bytes each) need to be pushed onto the stack to preserve them; STACK_OFFSET is the total size of those pushes
+%define STACK_OFFSET 8*4
+; TMP2..TMP8 are byte offsets of 16-byte scratch slots, used as [rsp + TMPn]
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7 ; seven 16-byte slots: TMP2..TMP8
+; Extra space reserved only for win64 builds: ten 16-byte slots (presumably for saving xmm6-xmm15 -- confirm in FUNC_SAVE)
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+; Total size of the scratch area: local slots plus the optional XMM save area
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes; first operand on entry, overwritten with the reduced product on exit
+%define %%HK %2 ; 16 Bytes; second operand, only read
+%define %%T1 %3 ; xmm scratch, clobbered
+%define %%T2 %4 ; xmm scratch, clobbered
+%define %%T3 %5 ; xmm scratch, clobbered
+%define %%T4 %6 ; xmm scratch, clobbered
+%define %%T5 %7 ; xmm scratch, clobbered
+ ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method: three 64x64 carry-less multiplies instead of four
+ movdqa %%T1, %%GH ; %%T1 = copy of %%GH (used for the high product)
+ pshufd %%T2, %%GH, 01001110b ; %%T2 = %%GH with its 64-bit halves swapped
+ pshufd %%T3, %%HK, 01001110b ; %%T3 = %%HK with its 64-bit halves swapped
+ pxor %%T2, %%GH ; %%T2 = (a1+a0)
+ pxor %%T3, %%HK ; %%T3 = (b1+b0)
+
+ pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T2, %%GH
+ pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%GH, %%T3
+ pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%GH
+ movdqa %%T3, %%GH
+ movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+ pslld %%T2, 31 ; packed left shift of each DW: << 31
+ pslld %%T3, 30 ; packed left shift of each DW: << 30
+ pslld %%T4, 25 ; packed left shift of each DW: << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T5, %%T2 ; keep a copy; it is folded back in during the second phase
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+ movdqa %%T2,%%GH ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%GH
+ movdqa %%T4,%%GH
+
+ psrld %%T2,1 ; packed right shift of each DW: >> 1
+ psrld %%T3,2 ; packed right shift of each DW: >> 2
+ psrld %%T4,7 ; packed right shift of each DW: >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T5 ; fold in the bits saved during the first phase
+ pxor %%GH, %%T2
+ pxor %%GH, %%T1 ; xor in the high half of the product; the final result is in %%GH
+
+
+%endmacro
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1 ; base pointer; results are stored at [%%GDATA + HashKey_*]
+%define %%HK %2 ; xmm register holding the hash key; only read
+%define %%T1 %3 ; xmm scratch, clobbered
+%define %%T2 %4 ; xmm scratch, clobbered
+%define %%T3 %5 ; xmm scratch, clobbered
+%define %%T4 %6 ; xmm scratch; running power of the hash key
+%define %%T5 %7 ; xmm scratch, clobbered
+%define %%T6 %8 ; xmm scratch, clobbered
+
+; Stores powers 2..8 of the hash key, and for powers 1..8 the XOR of the two 64-bit halves (the *_k values).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
+ movdqa %%T4, %%HK ; %%T4 starts as HashKey^1
+ pshufd %%T1, %%HK, 01001110b ; swap the 64-bit halves
+ pxor %%T1, %%HK ; each half of %%T1 = high half XOR low half
+ movdqu [%%GDATA + HashKey_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly
+ movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly
+ movdqu [%%GDATA + HashKey_3], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_3_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly
+ movdqu [%%GDATA + HashKey_4], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly
+ movdqu [%%GDATA + HashKey_5], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_5_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly
+ movdqu [%%GDATA + HashKey_6], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly
+ movdqu [%%GDATA + HashKey_7], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly
+ movdqu [%%GDATA + HashKey_8], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_8_k], %%T1
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2 ; pointer to the first input byte; not modified
+%define %%LENGTH %3 ; number of bytes to read; not modified
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5 ; temp: bytes still to read
+%define %%TMP1 %6 ; temp: 64-bit accumulator for the byte-by-byte reads
+
+ pxor %%OUTPUT, %%OUTPUT ; start from all zeros so unread bytes stay 0
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH ; one past the last input byte; bytes are read backwards from here
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+ pinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists
+ je %%_done ;Exactly 8 bytes: flags are still those of the cmp above (pinsrq does not modify flags)
+
+ sub %%COUNTER, 8 ;Bytes left beyond the first 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] ;Last byte is read first, so it ends up most significant
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ pinsrq %%OUTPUT, %%TMP1, 1 ;Remaining bytes go into the high quadword
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done ;Zero length: leave %%OUTPUT all zeros
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ pinsrq %%OUTPUT, %%TMP1, 0 ;Fewer than 8 bytes: they go into the low quadword
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1 ; pointer to the AAD; not modified (copied to %%T1)
+%define %%A_LEN %2 ; AAD length in bytes; not modified (copied to %%T2)
+%define %%AAD_HASH %3 ; xmm register; zeroed first, holds the hash on exit
+%define %%HASH_KEY %4 ; xmm register passed to GHASH_MUL as the key operand
+%define %%XTMP1 %5 ; xmm temp reg 1
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ pxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block ; fewer than 16 bytes in total: only the short path runs
+
+%%_get_AAD_loop16: ; hash one full 16-byte block per iteration
+
+ movdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done ; length was a multiple of 16: no tail to process
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+ ; Fall through with fewer than 16 bytes left
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 ; zero-padded tail in %%XTMP1
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1 ; context pointer; PBlockLen, PBlockEncKey, HashKey and AadHash are accessed through it
+%define %%CYPH_PLAIN_OUT %2 ; output buffer pointer
+%define %%PLAIN_CYPH_IN %3 ; input buffer pointer
+%define %%PLAIN_CYPH_LEN %4 ; input length in bytes
+%define %%DATA_OFFSET %5 ; byte offset into input/output; advanced by the number of bytes written
+%define %%AAD_HASH %6 ; xmm register with the running hash; updated and stored to [%%GDATA + AadHash]
+%define %%ENC_DEC %7 ; DEC selects the decrypt path at assembly time; anything else the encrypt path
+ mov r13, [%%GDATA + PBlockLen] ; r13 = stored partial block length
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ XLDR xmm1, [%%PLAIN_CYPH_IN] ;If at least 16 bytes of data, just fill the xmm register (NOTE(review): no %%DATA_OFFSET here, unlike the short path below -- equivalent only if it is 0; confirm)
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 ; zero-padded input in xmm1
+
+%%_data_read: ;Finished reading in data
+
+
+ movdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ movdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ movdqa xmm3, xmm1 ; keep a copy of the input: it is what gets hashed on this path
+ pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block (negative if the block is still not full)
+ jge %%_no_extra_mask_1 ;Determine if the partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward by the shortfall
+%%_no_extra_mask_1:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pand xmm3, xmm1 ; apply the same mask to the saved input
+ pshufb xmm3, [SHUF_MASK]
+ pshufb xmm3, xmm2
+ pxor %%AAD_HASH, xmm3
+
+
+ cmp r15, 0
+ jl %%_partial_incomplete_1 ; block still not full: defer the GHASH multiply
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block completed: reset the stored partial length to 0
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still partial: grow the stored length by the input length
+%%_dec_done:
+ movdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block (negative if the block is still not full)
+ jge %%_no_extra_mask_2 ;Determine if the partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward by the shortfall
+%%_no_extra_mask_2:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pshufb xmm9, [SHUF_MASK]
+ pshufb xmm9, xmm2
+ pxor %%AAD_HASH, xmm9 ; on this path the freshly produced output is what gets hashed
+
+ cmp r15,0
+ jl %%_partial_incomplete_2 ; block still not full: defer the GHASH multiply
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block completed: reset the stored partial length to 0
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still partial: grow the stored length by the input length
+%%_encode_done:
+ movdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ pshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN ; block not filled: write out exactly the input length
+%%_count_set:
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left ; 8 bytes or fewer: write them one at a time
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax ; more than 8 bytes: store the low quadword in one go
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9 ; rax = remaining (high) quadword
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1 ; base pointer: round keys are read from 16*0..16*10, counter from CurCount
+%define %%CYPH_PLAIN_OUT %2 ; output buffer pointer
+%define %%PLAIN_CYPH_IN %3 ; input buffer pointer
+%define %%LENGTH %4 ; compared against 128 to decide whether 8 more blocks are processed
+%define %%DATA_OFFSET %5 ; byte offset into input/output; advanced by 16 per block processed
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7 ; xmm scratch (input block / GHASH_MUL temp)
+%define %%HASH_KEY %8 ; xmm register passed to GHASH_MUL as the key operand
+%define %%T3 %9 ; xmm scratch; receives the updated hash
+%define %%T4 %10 ; xmm scratch
+%define %%T5 %11 ; xmm scratch
+%define %%CTR %12 ; xmm counter block; loaded from [%%GDATA + CurCount] and incremented here
+%define %%XMM1 %13 ; %%XMM1..%%XMM8: block registers for the 8-block path
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20 ; holds AAD_HASH on entry
+%define %%T6 %21 ; xmm scratch
+%define %%T_key %22 ; xmm scratch for the current round key
+%define %%ENC_DEC %23 ; DEC makes GHASH run over the input instead of the output
+; NOTE(review): reg(i) is defined outside this macro; the code below appears to rely on reg(8) being %%XMM8 -- confirm.
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+
+ ; start AES for %%num_initial_blocks blocks
+ movdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+; Build one counter block per initial block in reg(9-n)..reg(8)
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa reg(i), %%CTR
+ pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+; Round 0: xor every block with the first round key
+movdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ pxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+; Rounds 1..9: one aesenc per block per round key
+%assign j 1
+%rep 9
+movdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+; Final round with the key at 16*10
+movdqu %%T_key, [%%GDATA+16*10]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+; Xor the keystream with the input, store the result, and keep the ciphertext side for GHASH
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ pxor reg(i), %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(i), %%T1
+ %endif
+ pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+; Chain GHASH: each block is xored with the previous hash value (in reg(i)) and then multiplied
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ pxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ movdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; %%LENGTH below 128: skip the 8-block section that follows
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Build 8 more counter blocks in %%XMM1..%%XMM8, encrypt them, and process the next 128 bytes of input
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM1, %%CTR
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM2, %%CTR
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM3, %%CTR
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM4, %%CTR
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM5, %%CTR
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM6, %%CTR
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM7, %%CTR
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM8, %%CTR
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu %%T_key, [%%GDATA+16*0] ; round 0: xor all 8 blocks with the first round key
+ pxor %%XMM1, %%T_key
+ pxor %%XMM2, %%T_key
+ pxor %%XMM3, %%T_key
+ pxor %%XMM4, %%T_key
+ pxor %%XMM5, %%T_key
+ pxor %%XMM6, %%T_key
+ pxor %%XMM7, %%T_key
+ pxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 9 ; do 9 rounds
+ movdqu %%T_key, [%%GDATA+16*i]
+ aesenc %%XMM1, %%T_key
+ aesenc %%XMM2, %%T_key
+ aesenc %%XMM3, %%T_key
+ aesenc %%XMM4, %%T_key
+ aesenc %%XMM5, %%T_key
+ aesenc %%XMM6, %%T_key
+ aesenc %%XMM7, %%T_key
+ aesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ movdqu %%T_key, [%%GDATA+16*i] ; i is 10 after the loop above: last round key
+ aesenclast %%XMM1, %%T_key
+ aesenclast %%XMM2, %%T_key
+ aesenclast %%XMM3, %%T_key
+ aesenclast %%XMM4, %%T_key
+ aesenclast %%XMM5, %%T_key
+ aesenclast %%XMM6, %%T_key
+ aesenclast %%XMM7, %%T_key
+ aesenclast %%XMM8, %%T_key
+ ; For each of the 8 blocks: load input, xor with keystream, store output; for DEC keep the input for GHASH
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ pxor %%XMM1, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM1, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ pxor %%XMM2, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM2, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ pxor %%XMM3, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM3, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ pxor %%XMM4, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM4, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ pxor %%XMM5, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM5, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ pxor %%XMM6, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM6, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ pxor %%XMM7, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM7, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ pxor %%XMM8, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+ ; No GHASH multiply is done here for these 8 blocks; they are only byte-swapped and left in %%XMM1..%%XMM8
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ movdqa %%T7, %%XMM1
+ movdqu [rsp + TMP2], %%XMM2
+ movdqu [rsp + TMP3], %%XMM3
+ movdqu [rsp + TMP4], %%XMM4
+ movdqu [rsp + TMP5], %%XMM5
+ movdqu [rsp + TMP6], %%XMM6
+ movdqu [rsp + TMP7], %%XMM7
+ movdqu [rsp + TMP8], %%XMM8
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+
+ movdqa %%T4, %%T7
+ pshufd %%T6, %%T7, 01001110b
+ pxor %%T6, %%T7
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ %endif
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ movdqa %%XMM1, %%CTR
+
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+ %endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ movdqu %%T1, [%%GDATA + 16*0]
+ pxor %%XMM1, %%T1
+ pxor %%XMM2, %%T1
+ pxor %%XMM3, %%T1
+ pxor %%XMM4, %%T1
+ pxor %%XMM5, %%T1
+ pxor %%XMM6, %%T1
+ pxor %%XMM7, %%T1
+ pxor %%XMM8, %%T1
+
+ ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP2]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*1]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T1, [%%GDATA + 16*2]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqu %%T1, [rsp + TMP3]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*3]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP4]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*4]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*5]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP5]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+
+ movdqu %%T1, [%%GDATA + 16*6]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+ movdqu %%T1, [rsp + TMP6]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*7]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP7]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*8]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP8]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T7, %%T3
+ pxor %%T4, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*9]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T5, [%%GDATA + 16*10]
+
+%assign i 0
+%assign j 1
+%rep 8
+ XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%T3, %%T1
+ %endif
+
+ pxor %%T1, %%T5
+ aesenclast reg(j), %%T1 ; XMM1:XMM8
+ XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer
+
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(j), %%T3
+ %endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+
+
+ pxor %%T2, %%T6
+ pxor %%T2, %%T4
+ pxor %%T2, %%T7
+
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T3
+ pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7
+
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently
+
+ pslld %%T2, 31 ; packed right shifting << 31
+ pslld %%T3, 30 ; packed right shifting shift << 30
+ pslld %%T1, 25 ; packed right shifting shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T1
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ ;second phase of the reduction
+ movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T1) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T1,%%T7
+
+ psrld %%T2,1 ; packed left shifting >> 1
+ psrld %%T3,2 ; packed left shifting >> 2
+ psrld %%T1,7 ; packed left shifting >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T1
+
+ pxor %%T2, %%T5
+ pxor %%T7, %%T2
+ pxor %%T7, %%T4 ; the result is in %%T4
+
+
+ pxor %%XMM1, %%T7
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks (%%XMM1..%%XMM8); the reduced result is left in %%T6.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ; Karatsuba Method: %%XMM1 * HashKey_8
+ movdqa %%T6, %%XMM1
+ pshufd %%T2, %%XMM1, 01001110b ; swap the 64-bit halves of %%XMM1
+ pxor %%T2, %%XMM1 ; %%T2 = a1+a0 (in both halves)
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1
+
+ pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ movdqa %%T7, %%XMM1
+ movdqa %%XMM1, %%T2 ; partial products: high in %%T6, low in %%T7, middle in %%XMM1
+
+
+ ; Karatsuba Method: %%XMM2 * HashKey_7
+ movdqa %%T1, %%XMM2
+ pshufd %%T2, %%XMM2, 01001110b
+ pxor %%T2, %%XMM2
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM2
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method: %%XMM3 * HashKey_6
+ movdqa %%T1, %%XMM3
+ pshufd %%T2, %%XMM3, 01001110b
+ pxor %%T2, %%XMM3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM3
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method: %%XMM4 * HashKey_5
+ movdqa %%T1, %%XMM4
+ pshufd %%T2, %%XMM4, 01001110b
+ pxor %%T2, %%XMM4
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM4, %%T5, 0x00 ; %%XMM4 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM4
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method: %%XMM5 * HashKey_4
+ movdqa %%T1, %%XMM5
+ pshufd %%T2, %%XMM5, 01001110b
+ pxor %%T2, %%XMM5
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM5, %%T5, 0x00 ; %%XMM5 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM5
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method: %%XMM6 * HashKey_3
+ movdqa %%T1, %%XMM6
+ pshufd %%T2, %%XMM6, 01001110b
+ pxor %%T2, %%XMM6
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM6, %%T5, 0x00 ; %%XMM6 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM6
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method: %%XMM7 * HashKey_2
+ movdqa %%T1, %%XMM7
+ pshufd %%T2, %%XMM7, 01001110b
+ pxor %%T2, %%XMM7
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM7, %%T5, 0x00 ; %%XMM7 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM7
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method: %%XMM8 * HashKey
+ movdqa %%T1, %%XMM8
+ pshufd %%T2, %%XMM8, 01001110b
+ pxor %%T2, %%XMM8
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM8, %%T5, 0x00 ; %%XMM8 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM8
+ pxor %%T2, %%XMM1
+ pxor %%T2, %%T6
+ pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm
+
+
+ movdqa %%T4, %%T2
+ pslldq %%T4, 8 ; shift-L %%T4 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T4
+ pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+ pslld %%T2, 31 ; packed (per-dword) left shift << 31
+ pslld %%T3, 30 ; packed (per-dword) left shift << 30
+ pslld %%T4, 25 ; packed (per-dword) left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T1, %%T2
+ psrldq %%T1, 4 ; shift-R %%T1 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+ movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T4,%%T7
+
+ psrld %%T2,1 ; packed (per-dword) right shift >> 1
+ psrld %%T3,2 ; packed (per-dword) right shift >> 2
+ psrld %%T4,7 ; packed (per-dword) right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T1
+ pxor %%T7, %%T2
+ pxor %%T6, %%T7 ; the result is in %%T6
+
+%endmacro
+
+; Encrypt the single 16-byte block in %%ST in place using the 11 round keys at %%GDATA+16*0 .. %%GDATA+16*10; %%T1 is scratch.
+%macro ENCRYPT_SINGLE_BLOCK 3
+%define %%GDATA %1
+%define %%ST %2
+%define %%T1 %3
+ movdqu %%T1, [%%GDATA+16*0]
+ pxor %%ST, %%T1 ; initial whitening with round key 0
+%assign i 1
+%rep 9
+ movdqu %%T1, [%%GDATA+16*i] ; round keys 1..9
+ aesenc %%ST, %%T1
+%assign i (i+1)
+%endrep
+ movdqu %%T1, [%%GDATA+16*10]
+ aesenclast %%ST, %%T1 ; final round with round key 10
+%endmacro
+
+
+;; Start of Stack Setup
+; FUNC_SAVE: push r12-r15, keep the post-push rsp in r14, carve out a 64-byte aligned frame and (win64 only) save xmm6:xmm15.
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp ; r14 = rsp after the pushes; FUNC_RESTORE reloads rsp from it
+
+ sub rsp, VARIABLE_OFFSET ; reserve VARIABLE_OFFSET bytes below the saved registers
+ and rsp, ~63 ; align rsp down to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+ ; Undo FUNC_SAVE: reload xmm6:xmm15 (win64 only), restore rsp from r14, then pop r15..r12.
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14 ; r14 still holds the rsp value recorded by FUNC_SAVE, so it must be live until here
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ movdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ pxor xmm2, xmm2 ; xmm2 = 0, stored below as the initial partial_block_enc_key
+ mov r10, %%A_LEN
+
+ movdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ movdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ movdqu xmm2, [r10] ; load the 16-byte counter block the caller passed as IV
+ movdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ pshufb xmm2, [SHUF_MASK] ; shuffle with SHUF_MASK (the 16-byte swap used throughout this file)
+
+ movdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = shuffled iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0 ; a zero-length call skips the whole macro body
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ movdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ movdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+ mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ;save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4 ; r12 = number of whole 16-byte blocks
+ and r12, 7 ; r12 = (number of whole blocks) mod 8
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0 ; r13 = bytes of whole blocks still to process
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ movd r15d, xmm9
+ and r15d, 255 ; r15d = low byte of xmm9 (xmm9 is what gets stored to CurCount below)
+ pshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8 ; if adding 8 would overflow the tracked low byte, take the in_order variant
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ pshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ pshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ movdqu [%%GDATA + AadHash], xmm14
+ movdqu [%%GDATA + CurCount], xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA + PBlockLen], r13 ; my_ctx.data.partial_blck_length = r13
+ ; handle the last <16 Byte block separately
+
+ paddd xmm9, [ONE] ; INCR CNT to get Yn
+ movdqu [%%GDATA + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9
+ pshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9, xmm2 ; E(K, Yn)
+ movdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ movdqa xmm2, xmm1
+ pxor xmm9, xmm1 ; Ciphertext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pand xmm2, xmm1
+ pshufb xmm2, [SHUF_MASK]
+ pxor xmm14, xmm2
+ movdqu [%%GDATA + AadHash], xmm14
+
+ %else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pshufb xmm9, [SHUF_MASK]
+ pxor xmm14, xmm9
+ movdqu [%%GDATA + AadHash], xmm14
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al ; write the remaining bytes one at a time
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0-xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen] ; r12 = partial block length (number of bytes)
+ movdqu xmm14, [%%GDATA + AadHash]
+ movdqu xmm3, [%%GDATA + HashKey] ; xmm3 = HashKey (xmm3/xmm4 are volatile; the *_finalize_sse wrappers do not save xmm13/xmm10)
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm3, xmm0, xmm4, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ movdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ movq xmm15, r12 ; len(A) in xmm15 (full 64 bits, same as len(C) below)
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
+ movq xmm1, %%PLAIN_CYPH_LEN
+ pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ pxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ pxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm3, xmm0, xmm4, xmm11, xmm5, xmm6 ; final GHASH computation
+ pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu xmm9, [%%GDATA + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9, xmm2 ; E(K, Y0)
+
+ pxor xmm9, xmm14 ; xmm9 = E(K, Y0) XOR GHASH result = the tag
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8: ; any length other than 16 or 12 falls through to the 8-byte store
+ movq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ movq rax, xmm9
+ mov [r10], rax
+ psrldq xmm9, 8
+ movd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ movdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ;GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_precomp_sse
+; (gcm_data *my_ctx_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_precomp_sse
+aesni_gcm128_precomp_sse:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp ; remember rsp so it can be restored after the aligned frame below
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ pxor xmm6, xmm6 ; start from an all-zero block
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey
+
+ pshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ movdqa xmm2, xmm6
+ psllq xmm6, 1 ; shift each 64-bit half left by one bit
+ psrlq xmm2, 63 ; xmm2 = the bit shifted out of each 64-bit half
+ movdqa xmm1, xmm2
+ pslldq xmm2, 8 ; carry out of the low half moves into the high half
+ psrldq xmm1, 8 ; xmm1 = carry out of the high half
+ por xmm6, xmm2
+ ;reduction
+ pshufd xmm2, xmm1, 00100100b
+ pcmpeqd xmm2, [TWOONE]
+ pand xmm2, [POLY]
+ pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_init_sse(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_init_sse
+aesni_gcm128_init_sse:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows; GCM_INIT's documented clobbers are xmm0-xmm6, so only xmm6 is saved
+ sub rsp, 1*16
+ movdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_update_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_update_sse
+aesni_gcm128_enc_update_sse:
+ ; save r12-r15 (plus xmm6:xmm15 on win64) and set up the aligned stack frame
+ FUNC_SAVE
+ ; GCM_ENC_DEC takes (GDATA, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, PLAIN_CYPH_LEN, ENC_DEC)
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+ ; undo FUNC_SAVE
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_update_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Cyphertext input */
+; u64 plaintext_len); /* Length of data in Bytes for decryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_update_sse
+aesni_gcm128_dec_update_sse:
+ ; save r12-r15 (plus xmm6:xmm15 on win64) and set up the aligned stack frame
+ FUNC_SAVE
+ ; GCM_ENC_DEC takes (GDATA, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, PLAIN_CYPH_LEN, ENC_DEC)
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+ ; undo FUNC_SAVE
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_finalize_sse(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_finalize_sse
+aesni_gcm128_enc_finalize_sse:
+
+ push r12 ; GCM_COMPLETE uses r12 as scratch
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows; save the ones in that range named in GCM_COMPLETE's clobber list
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp + 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_finalize_sse(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_finalize_sse
+aesni_gcm128_dec_finalize_sse:
+
+ push r12 ; GCM_COMPLETE uses r12 as scratch
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows; save the ones in that range named in GCM_COMPLETE's clobber list
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp + 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_sse
+aesni_gcm128_enc_sse:
+ ; one-shot encrypt: init the context, process all data, then produce the tag
+ FUNC_SAVE
+ ; GCM_INIT takes (GDATA, IV, A_IN, A_LEN)
+ GCM_INIT arg1, arg5, arg6, arg7
+ ; GCM_ENC_DEC takes (GDATA, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, PLAIN_CYPH_LEN, ENC_DEC)
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+ ; GCM_COMPLETE takes (GDATA, AUTH_TAG, AUTH_TAG_LEN, ENC_DEC)
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_sse
+aesni_gcm128_dec_sse:
+ ; one-shot decrypt: init the context, process all data, then write the computed tag to auth_tag
+ FUNC_SAVE
+ ; GCM_INIT takes (GDATA, IV, A_IN, A_LEN)
+ GCM_INIT arg1, arg5, arg6, arg7
+ ; GCM_ENC_DEC takes (GDATA, CYPH_PLAIN_OUT, PLAIN_CYPH_IN, PLAIN_CYPH_LEN, ENC_DEC)
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+ ; GCM_COMPLETE takes (GDATA, AUTH_TAG, AUTH_TAG_LEN, ENC_DEC)
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm
new file mode 100644
index 00000000..2c10c916
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm
@@ -0,0 +1,2036 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[2] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; By the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any other aadLen value.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; The function prologue (outside this view) presumably pushes 4 registers of 8 bytes each -- confirm
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register); used as [rsp + TMP2]
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7 ; seven 16-byte slots: TMP2 .. TMP8
+; win64 output reserves 16*10 extra bytes -- presumably a save area for ten xmm registers; confirm
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+; total size of the temp slots plus the optional win64 area
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes; multiplicand on entry, overwritten with the reduced product
+%define %%HK %2 ; 16 Bytes; hash key operand, only read
+%define %%T1 %3 ; scratch xmm: high 128 bits of the 256-bit product
+%define %%T2 %4 ; scratch xmm
+%define %%T3 %5 ; scratch xmm
+%define %%T4 %6 ; scratch xmm
+%define %%T5 %7 ; scratch xmm: keeps the dword shifted out during the first reduction phase
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba: three carry-less multiplies (a1*b1, a0*b0, (a1+a0)*(b1+b0))
+ vpshufd %%T2, %%GH, 01001110b ; swap the two qwords of %%GH
+ vpshufd %%T3, %%HK, 01001110b ; swap the two qwords of %%HK
+ vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
+ vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ vpxor %%T2, %%T2, %%GH
+ vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ vpslldq %%T3, %%T2, 8 ; %%T3 = middle term shifted left 8 bytes (2 DWs)
+ vpsrldq %%T2, %%T2, 8 ; %%T2 = middle term shifted right 8 bytes (2 DWs)
+ vpxor %%GH, %%GH, %%T3
+ vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
+
+ ;first phase of the reduction
+ vpslld %%T2, %%GH, 31 ; packed (per-dword) left shift << 31
+ vpslld %%T3, %%GH, 30 ; packed (per-dword) left shift << 30
+ vpslld %%T4, %%GH, 25 ; packed (per-dword) left shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T5, %%T2, 4 ; %%T5 = %%T2 shifted right 4 bytes (1 DW), consumed in the second phase
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%GH,1 ; packed (per-dword) right shift >> 1
+ vpsrld %%T3,%%GH,2 ; packed (per-dword) right shift >> 2
+ vpsrld %%T4,%%GH,7 ; packed (per-dword) right shift >> 7
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpxor %%T2, %%T2, %%T5 ; fold in the dword saved during the first phase
+ vpxor %%GH, %%GH, %%T2
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1 ; base address of the structure that receives the HashKey_* entries
+%define %%HK %2 ; xmm holding the hash key; only read
+%define %%T1 %3 ; scratch xmm
+%define %%T2 %4 ; scratch xmm
+%define %%T3 %5 ; scratch xmm
+%define %%T4 %6 ; scratch xmm
+%define %%T5 %7 ; running power of the hash key
+%define %%T6 %8 ; scratch xmm
+; Stores HashKey_k, then HashKey_2 .. HashKey_8 together with their _k companions, into %%GDATA.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b ; swap the two qwords
+ vpxor %%T1, %%T5 ; both qwords of %%T1 now hold high XOR low
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2 ; address of the first input byte
+%define %%LENGTH %3 ; number of input bytes
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5 ; bytes still to be gathered
+%define %%TMP1 %6 ; 64-bit accumulator for the byte-wise reads
+; Bytes are gathered back-to-front into %%TMP1, so the lowest-addressed byte lands in the low byte.
+ vpxor %%OUTPUT, %%OUTPUT ; start from zero so lanes that are never written stay 0
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH ; one past the last input byte
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+ vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists
+ je %%_done ; still acting on the cmp above: exactly 8 bytes, nothing left to gather
+
+ sub %%COUNTER, 8 ; bytes beyond the first 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1 ; gathered tail goes into the upper qword
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done ; zero length: %%OUTPUT stays all zeroes
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0 ; fewer than 8 bytes in total: they go into the lower qword
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1 ; address of the AAD
+%define %%A_LEN %2 ; AAD length in bytes
+%define %%AAD_HASH %3 ; xmm register that receives the hash
+%define %%HASH_KEY %4 ; xmm register holding the hash key
+%define %%XTMP1 %5 ; xmm temp reg 1
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+; Full 16-byte blocks are hashed in the loop below; any remainder (or an empty AAD) goes through the small-block path.
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH ; hash starts at zero
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done ; length was an exact multiple of 16: no tail to hash
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 ; zero-padded load of the remaining T2 (< 16) bytes
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1 ; gcm_data struct pointer
+%define %%CYPH_PLAIN_OUT %2 ; output buffer base
+%define %%PLAIN_CYPH_IN %3 ; input buffer base
+%define %%PLAIN_CYPH_LEN %4 ; input length in bytes
+%define %%DATA_OFFSET %5 ; offset applied to both buffers; advanced by the number of bytes written
+%define %%AAD_HASH %6 ; xmm register holding the running hash
+%define %%ENC_DEC %7 ; ENC or DEC
+
+ mov r13, [%%GDATA + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] ;At least 16 bytes available: one full load, at the same offset the short path and the stores use
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1 ; keep the ciphertext: it is what gets hashed when decrypting
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward by the shortfall
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block is complete: no partial bytes pending
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still incomplete: just record the extra bytes
+%%_dec_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward by the shortfall
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9 ; when encrypting, the freshly produced ciphertext is what gets hashed
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block is complete: no partial bytes pending
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still incomplete: just record the extra bytes
+%%_encode_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN ; block not filled: every input byte is written out
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax ; more than 8 bytes to write: store the low qword in one go
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al ; remaining bytes (at most 8) are stored one at a time
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+; reg(8-n) receives the incoming hash; reg(9-n) .. reg(8) hold the n initial counter blocks.
+%assign i (8-%%num_initial_blocks)
+ vmovdqu reg(i), %%XMM8 ;move AAD_HASH to temp reg (VEX form, like every other instruction in this macro)
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep 13 ; encrypt N blocks with 13 key rounds
+vmovdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA+16*j] ; encrypt with last (14th) key round
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+; Chain the hash through the blocks: each block absorbs its predecessor, so the final value lands in reg(8).
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; At least 128 bytes remain: encrypt the next 8 counter blocks; their GHASH is left to the code that follows this macro
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA+16*0]
+ vpxor %%XMM1, %%T_key
+ vpxor %%XMM2, %%T_key
+ vpxor %%XMM3, %%T_key
+ vpxor %%XMM4, %%T_key
+ vpxor %%XMM5, %%T_key
+ vpxor %%XMM6, %%T_key
+ vpxor %%XMM7, %%T_key
+ vpxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 13 ; do early (13) rounds
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ vmovdqu %%T_key, [%%GDATA+16*i] ; do final key round
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; r11 is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%T1
+ vpxor %%XMM2, %%T1
+ vpxor %%XMM3, %%T1
+ vpxor %%XMM4, %%T1
+ vpxor %%XMM5, %%T1
+ vpxor %%XMM6, %%T1
+ vpxor %%XMM7, %%T1
+ vpxor %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+
+ vpshufd %%T6, %%T2, 01001110b
+ vpxor %%T6, %%T2
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%T6, %%T6, %%T5, 0x00 ;
+
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpxor %%T6, %%T4
+ vpxor %%T6, %%T7
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+
+
+%assign i 0
+%assign j 1
+%rep 8
+ %ifidn %%ENC_DEC, ENC
+
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ vaesenclast reg(j), reg(j), %%T2
+
+ %else
+
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+
+ %endif
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpxor %%T7, %%T3
+ vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
+
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed right shifting << 31
+ vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30
+ vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed left shifting >> 1
+ vpsrld %%T3,%%T7,2 ; packed left shifting >> 2
+ vpsrld %%T4,%%T7,7 ; packed left shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK]
+ vpshufb %%XMM3, [SHUF_MASK]
+ vpshufb %%XMM4, [SHUF_MASK]
+ vpshufb %%XMM5, [SHUF_MASK]
+ vpshufb %%XMM6, [SHUF_MASK]
+ vpshufb %%XMM7, [SHUF_MASK]
+ vpshufb %%XMM8, [SHUF_MASK]
+
+
+ vpxor %%XMM1, %%T6
+
+%endmacro
+
+
+; GHASH_LAST_8: GHASH the last 8 ciphertext blocks (%%XMM1 .. %%XMM8).
+; Each block is carry-less multiplied by the matching HashKey_8 .. HashKey entry
+; read from %%GDATA (Karatsuba method, using the HashKey_*_k entries for the
+; middle term), the eight products are XORed together into <%%T6:%%T7> and that
+; value is reduced in two phases.
+; Input: %%GDATA (base address for the HashKey* loads), %%XMM1 .. %%XMM8.
+; Output: the reduced result in %%T6.
+; Clobbers: %%T1 .. %%T7 and %%XMM1 (used as the middle-term accumulator).
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+ ;; %%T6 accumulates the high products, %%T7 the low products and
+ ;; %%XMM1 the middle products of all eight blocks.
+
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpxor %%T2, %%XMM1
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vmovdqu %%T3, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpxor %%T2, %%XMM2
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpxor %%T2, %%XMM3
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpxor %%T2, %%XMM4
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpxor %%T2, %%XMM5
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpxor %%T2, %%XMM6
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpxor %%T2, %%XMM7
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpxor %%T2, %%XMM8
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ ; Karatsuba fix-up: middle ^= high ^ low, result left in %%T2
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8 ; low half of the middle term, moved up 8 bytes
+ vpsrldq %%T2, %%T2, 8 ; high half of the middle term, moved down 8 bytes
+
+ vpxor %%T7, %%T4
+ vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed (per-dword) left shift << 31
+ vpslld %%T3, %%T7, 30 ; packed (per-dword) left shift << 30
+ vpslld %%T4, %%T7, 25 ; packed (per-dword) left shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed (per-dword) right shift >> 1
+ vpsrld %%T3,%%T7,2 ; packed (per-dword) right shift >> 2
+ vpsrld %%T4,%%T7,7 ; packed (per-dword) right shift >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+%endmacro
+
+
+; Encryption of a single block
+; Input: %%GDATA - base address of the round keys, read at 16-byte strides
+; from [%%GDATA+16*0] through [%%GDATA+16*14];
+; %%XMM0 - the block to encrypt.
+; Output: %%XMM0 is overwritten with the encrypted block.
+; Side effect: leaves the assembler variable i set to 14.
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, [%%GDATA+16*0] ; initial key whitening (round key 0)
+%assign i 1
+%rep 13 ; early key rounds (13)
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i] ; final key round (14)
+%endmacro
+
+
+;; Start of Stack Setup
+
+; FUNC_SAVE: common prologue.
+; Pushes r12-r15, records the post-push rsp in r14, then reserves
+; VARIABLE_OFFSET bytes and rounds rsp down to a 64-byte boundary.
+; On win64 it additionally stores xmm6-xmm15 in the LOCAL_STORAGE area of
+; the new frame. FUNC_RESTORE undoes all of this and relies on r14.
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp ; remember rsp before it is lowered and aligned
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp down to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+; FUNC_RESTORE: common epilogue, the inverse of FUNC_SAVE.
+; On win64 reloads xmm6-xmm15 from the LOCAL_STORAGE area (addressed through
+; the still-aligned rsp), then restores rsp from r14 and pops r15-r12.
+; r14 must therefore still hold the value FUNC_SAVE put there.
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14 ; drop the aligned frame; rsp now points at the saved r15
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA:
+; AadLen = A_LEN, InLen = 0, PBlockLen = 0, PBlockEncKey = 0,
+; OrigIV = the 16 bytes at IV, CurCount = those bytes shuffled with SHUF_MASK.
+; Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ vpxor xmm2, xmm2, xmm2 ; xmm2 = 0, so that PBlockEncKey below really is cleared
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqu xmm2, [r10] ; load the 16-byte IV / pre-counter block
+ vmovdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; A zero PLAIN_CYPH_LEN is handled by jumping straight to the end of the macro;
+; READ_SMALL_DATA_INPUT is only reached with 1..15 bytes left.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes ; nothing to do for empty input
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ vmovdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7 ; r12 = (number of whole 16-byte blocks) mod 8
+
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ ; r15d tracks the lowest byte of xmm9 (the counter stored to CurCount below)
+ ; so the loop can tell when adding 8 would overflow that byte.
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8 ; low byte would overflow: use the in_order variant
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8 ; 8-bit add, wraps modulo 256
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA + PBlockLen], r13 ; my_ctx_data.partial_blck_length = r13
+ ; handle the last <16 Byte block separately
+
+ vpaddd xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ ; fewer than 16 bytes in total: read only the r13 bytes that are present
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ ; at least 16 bytes of input exist: load the 16 bytes that end at the
+ ; end of the input, then shift the unwanted leading bytes out
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1 ; keep the ciphertext: it is what gets hashed when decrypting
+ vpxor xmm9, xmm1 ; Ciphertext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm2
+ vmovdqu [%%GDATA + AadHash], xmm14
+
+ %else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm9
+ vmovdqu [%%GDATA + AadHash], xmm14
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ ; more than 8 bytes: store the low qword at once, continue with the high one
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ ; store the remaining r13 (1..8) bytes one at a time
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA), the tag destination (AUTH_TAG), the requested tag
+; length in bytes (AUTH_TAG_LEN) and whether encoding or decoding (ENC_DEC; accepted
+; but not referenced in this macro body).
+; Output: Authorization Tag written to AUTH_TAG. Lengths 16 and 12 are handled explicitly;
+; any other AUTH_TAG_LEN value writes 8 bytes.
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15
+; (xmm0, xmm10, xmm11, xmm5, xmm6 are handed to GHASH_MUL as scratch registers)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen]
+ vmovdqu xmm14, [%%GDATA+AadHash]
+ vmovdqu xmm13, [%%GDATA+HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA+InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovq xmm15, r12 ; len(A) in xmm15, all 64 bits (a 32-bit move would truncate AAD >= 512 MiB)
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA+OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm14 ; tag = E(K, Y0) XOR GHASH result
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ ; default: store the low 8 bytes of the tag
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ ; store the low 8 bytes, then the next 4
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_precomp_avx_gen2
+; (gcm_data *my_ctx_data);
+;
+; Encrypts an all-zero block with the round keys at arg1 to obtain the hash
+; key, stores HashKey<<1 mod poly at [arg1 + HashKey] and then invokes
+; PRECOMPUTE on it (presumably filling the remaining HashKey_* table entries -
+; PRECOMPUTE is defined elsewhere in this file).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_precomp_avx_gen2
+aesni_gcm256_precomp_avx_gen2:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp ; keep the unaligned rsp for the epilogue
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6 ; xmm6 = all-zero block
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, 1 ; shift each 64-bit half left by one bit
+ vpsrlq xmm2, 63 ; xmm2 = the bit shifted out of each half
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8 ; carry from the low half, moved into the high half
+ vpsrldq xmm1, xmm1, 8 ; carry out of the high half, kept for the reduction
+ vpor xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, [POLY]
+ vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_init_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+;
+; Thin wrapper around GCM_INIT. r12/r13 are saved because GCM_INIT clobbers them.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_init_avx_gen2
+aesni_gcm256_init_avx_gen2:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows; GCM_INIT only touches
+ ; xmm0-xmm6, so xmm6 is the only one saved here
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_update_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+;
+; Wrapper: FUNC_SAVE, GCM_ENC_DEC in ENC mode, FUNC_RESTORE.
+; NOTE(review): GCM_ENC_DEC contains a path for a trailing <16-byte block
+; (it records PBlockLen); confirm whether the multiple-of-16 restriction
+; stated above still applies.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_update_avx_gen2
+aesni_gcm256_enc_update_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_update_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len); /* Length of data in Bytes for decryption. must be a multiple of 16 bytes*/
+;
+; Wrapper: FUNC_SAVE, GCM_ENC_DEC in DEC mode, FUNC_RESTORE.
+; NOTE(review): GCM_ENC_DEC contains a path for a trailing <16-byte block
+; (it records PBlockLen); confirm whether the multiple-of-16 restriction
+; stated above still applies.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_update_avx_gen2
+aesni_gcm256_dec_update_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_finalize_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;
+; Wrapper around GCM_COMPLETE (ENC). Saves r12, which the macro clobbers, and on
+; win64 every callee-saved xmm register the macro uses.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_finalize_avx_gen2
+aesni_gcm256_enc_finalize_avx_gen2:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows. GCM_COMPLETE loads HashKey
+ ; into xmm13 and hands xmm10 to GHASH_MUL as scratch, so both are saved
+ ; together with xmm6, xmm9, xmm11, xmm14 and xmm15.
+ sub rsp, 7*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm10
+ vmovdqu [rsp + 3*16],xmm11
+ vmovdqu [rsp + 4*16],xmm13
+ vmovdqu [rsp + 5*16],xmm14
+ vmovdqu [rsp + 6*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 6*16]
+ vmovdqu xmm14 , [rsp + 5*16]
+ vmovdqu xmm13 , [rsp + 4*16]
+ vmovdqu xmm11 , [rsp + 3*16]
+ vmovdqu xmm10 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 7*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_finalize_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;
+; Wrapper around GCM_COMPLETE (DEC). Saves r12, which the macro clobbers, and on
+; win64 every callee-saved xmm register the macro uses.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_finalize_avx_gen2
+aesni_gcm256_dec_finalize_avx_gen2:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows. GCM_COMPLETE loads HashKey
+ ; into xmm13 and hands xmm10 to GHASH_MUL as scratch, so both are saved
+ ; together with xmm6, xmm9, xmm11, xmm14 and xmm15.
+ sub rsp, 7*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm10
+ vmovdqu [rsp + 3*16],xmm11
+ vmovdqu [rsp + 4*16],xmm13
+ vmovdqu [rsp + 5*16],xmm14
+ vmovdqu [rsp + 6*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 6*16]
+ vmovdqu xmm14 , [rsp + 5*16]
+ vmovdqu xmm13 , [rsp + 4*16]
+ vmovdqu xmm11 , [rsp + 3*16]
+ vmovdqu xmm10 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 7*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;
+; One-shot encryption: GCM_INIT, GCM_ENC_DEC (ENC) and GCM_COMPLETE run back
+; to back inside a single FUNC_SAVE / FUNC_RESTORE frame.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_avx_gen2
+aesni_gcm256_enc_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7 ; ctx, iv, aad, aad_len
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC ; ctx, out, in, plaintext_len
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC ; ctx, auth_tag, auth_tag_len
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;
+; One-shot decryption: GCM_INIT, GCM_ENC_DEC (DEC) and GCM_COMPLETE run back
+; to back inside a single FUNC_SAVE / FUNC_RESTORE frame. The computed tag is
+; written to auth_tag; no comparison against an expected tag happens here.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_avx_gen2
+aesni_gcm256_dec_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7 ; ctx, iv, aad, aad_len
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC ; ctx, out, in, plaintext_len
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC ; ctx, auth_tag, auth_tag_len
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm
new file mode 100644
index 00000000..022f73fa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm
@@ -0,0 +1,2030 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in the paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+; The details of the implementation are explained in:
+; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[2] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; By the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentation is used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; 4 registers (8 bytes each) are pushed onto the stack -- presumably by FUNC_SAVE, confirm; STACK_OFFSET is the space they take
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7 ; seven 16-byte slots, one per TMP2..TMP8 offset above
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10 ; ten 16-byte slots; presumably the save area for callee-saved xmm6-xmm15 on Win64 -- confirm in FUNC_SAVE
+%else
+ %define XMM_STORAGE 0 ; no XMM save area for other output formats
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE ; total bytes: local slots plus the XMM save area
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected), passed as %%GH and %%HK
+; Output: C = A*B*x mod poly, (i.e. >>1 ), written back to %%GH; %%HK is only read
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes, in/out: hash value, replaced by the reduced product
+%define %%HK %2 ; 16 Bytes, in: hash key operand (never written here)
+%define %%T1 %3 ; xmm temp: high 128 bits of the product
+%define %%T2 %4 ; xmm temp: a0*b0, then reduction scratch
+%define %%T3 %5 ; xmm temp: middle term, then the POLY2 constant
+%define %%T4 %6 ; accepted but not referenced in this macro body
+%define %%T5 %7 ; accepted but not referenced in this macro body
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxor %%GH, %%GH, %%T3 ; %%GH = a0*b1 + a1*b0 (middle 128-bit term)
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs: upper half of the middle term
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs: lower half of the middle term
+
+ vpxor %%T1, %%T1, %%T3 ; fold the middle term into the high 128 bits
+ vpxor %%GH, %%GH, %%T2 ; <%%T1:%%GH> now holds the 256-bit carry-less product
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2] ; reduction constant (POLY2 is defined outside this view)
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01 ; POLY2[127:64] * %%GH[63:0]
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00 ; POLY2[63:0] * %%GH[63:0]
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10 ; POLY2[63:0] * %%GH[127:64]
+ vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
+
+
+; PRECOMPUTE: from %%HK, computes HashKey^2..HashKey^8 (<<1 mod poly) into [%%GDATA + HashKey_i], plus HashKey_i_k for i = 1..8.
+; The commands filling HashKey_i_k are not required for avx_gen4 functions, but are kept to allow users
+; to switch cpu architectures between calls of pre, init, update, and finalize.
+%macro PRECOMPUTE 8
+%define %%GDATA %1 ; base pointer for the HashKey_* stores
+%define %%HK %2 ; xmm, in: hash key (only read)
+%define %%T1 %3 ; xmm temp: HashKey_i_k value, then GHASH_MUL scratch
+%define %%T2 %4 ; xmm, passed as GHASH_MUL's 7th argument
+%define %%T3 %5 ; xmm temp: GHASH_MUL scratch
+%define %%T4 %6 ; xmm temp: GHASH_MUL scratch
+%define %%T5 %7 ; xmm temp: running power of the hash key
+%define %%T6 %8 ; xmm, passed as GHASH_MUL's 6th argument
+
+ ; HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b ; swap the two 64-bit halves
+ vpxor %%T1, %%T5 ; each half of %%T1 = high XOR low
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0. Never reads past INPUT + LENGTH.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT); unread high bytes are zero.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2 ; address of the first input byte (not modified)
+%define %%LENGTH %3 ; number of input bytes (not modified)
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5 ; temp: bytes still to be read one at a time
+%define %%TMP1 %6 ; temp: accumulates up to 8 bytes for one qword insert
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH ; one past the last input byte; bytes are read backwards from here
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+ vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist (vpinsrq leaves the flags from cmp intact)
+ je %%_done ; exactly 8 bytes: nothing more to read
+
+ sub %%COUNTER, 8 ; bytes remaining beyond the first qword
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] ; last byte first, so the earliest byte ends up lowest
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1 ; remaining bytes go into the upper qword
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done ; zero-length input: OUTPUT stays all zero
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0 ; fewer than 8 bytes: they go into the lower qword
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH). A_IN and A_LEN are copied to temps and left unchanged.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 1
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH ; hash starts at zero
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block ; fewer than 16 bytes (including 0): single padded block
+
+%%_get_AAD_loop16: ; hash one full 16-byte block per iteration
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done ; length was a multiple of 16: no tail block
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block: ; 0..15 bytes left: read them zero-padded into one block
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA, DATA_OFFSET and AAD_HASH
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+ mov r13, [%%GDATA + PBlockLen] ; r13 = bytes already held in the pending partial block
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If at least 16 bytes of data, just fill the xmm register. NOTE(review): no DATA_OFFSET here, unlike the short path -- presumably it is 0 at this point, confirm in caller
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1 ; keep a copy of the ciphertext input for the hash
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this advances the mask pointer by the shortfall
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1 ; apply the same mask to the saved ciphertext
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3 ; fold the masked ciphertext bytes into the hash
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1 ; block still not full: skip the multiply
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block completed: no partial bytes pending
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still partial: record the extra bytes
+%%_dec_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this advances the mask pointer by the shortfall
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9 ; fold the new ciphertext bytes into the hash
+
+ cmp r15,0
+ jl %%_partial_incomplete_2 ; block still not full: skip the multiply
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax ; block completed: no partial bytes pending
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still partial: record the extra bytes
+%%_encode_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN ; block not filled: write out only the new input bytes
+%%_count_set:
+ vmovq rax, xmm9 ; rax = low 8 output bytes
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax ; more than 8 bytes: store the low qword at once
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9 ; rax = next 8 output bytes
+ sub r13, 8
+%%_less_than_8_bytes_left: ; store the remaining r13 bytes one at a time
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified; %%DATA_OFFSET and %%CTR are advanced.
+; Updated AAD_HASH is returned in %%T3
+; If %%LENGTH >= 128, the next 8 blocks are also encrypted and written out; their byte-swapped ciphertext is left in %%XMM1..%%XMM8 (%%XMM1 already XORed with the hash).
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg (VEX-encoded, so no legacy SSE instruction is mixed into this AVX code)
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep 13 ; encrypt N blocks with 13 key rounds
+vmovdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA+16*j] ; encrypt with last (14th) key round
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1 ; when decrypting, the input block is the ciphertext to be hashed
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(j), reg(i) ; chain: XOR the running hash into the next ciphertext block
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Build the next 8 counter blocks in %%XMM1..%%XMM8 and encrypt them
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA+16*0]
+ vpxor %%XMM1, %%XMM1, %%T_key
+ vpxor %%XMM2, %%XMM2, %%T_key
+ vpxor %%XMM3, %%XMM3, %%T_key
+ vpxor %%XMM4, %%XMM4, %%T_key
+ vpxor %%XMM5, %%XMM5, %%T_key
+ vpxor %%XMM6, %%XMM6, %%T_key
+ vpxor %%XMM7, %%XMM7, %%T_key
+ vpxor %%XMM8, %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 13 ; do early (13) rounds
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ vmovdqu %%T_key, [%%GDATA+16*i] ; last (14th) round key
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ vpxor %%XMM2, %%XMM2, %%T1
+ vpxor %%XMM3, %%XMM3, %%T1
+ vpxor %%XMM4, %%XMM4, %%T1
+ vpxor %%XMM5, %%XMM5, %%T1
+ vpxor %%XMM6, %%XMM6, %%T1
+ vpxor %%XMM7, %%XMM7, %%T1
+ vpxor %%XMM8, %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T1, %%T4, %%T3
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+
+%assign i 0
+%assign j 1
+%rep 8
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpxor %%T7, %%T7, %%T3
+ vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+
+ vpxor %%XMM1, %%T1
+
+
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+; Multiplies %%XMM1..%%XMM8 by HashKey_8..HashKey respectively using the Karatsuba
+; method, accumulates the partial products and reduces the sum with POLY2.
+; Input: gcm_data struct* (GDATA) from which the HashKey_n values are loaded, and the
+; eight blocks in %%XMM1-%%XMM8.
+; Output: the reduced result in %%T6.
+; Clobbers %%T2-%%T7 and %%XMM1. %%T1 is accepted as an argument but is never
+; referenced inside this macro.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+ ;; Accumulators: %%T6 = sum of high products (0x11), %%T7 = sum of low products (0x00),
+ ;; %%XMM1 = sum of the (hi xor lo) * (hi xor lo) middle products.
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+
+ vpshufd %%T2, %%XMM1, 01001110b ; swap the two qwords of the block
+ vpshufd %%T3, %%T5, 01001110b ; swap the two qwords of the hash key
+ vpxor %%T2, %%T2, %%XMM1 ; each qword of %%T2 = hi xor lo of the block
+ vpxor %%T3, %%T3, %%T5 ; each qword of %%T3 = hi xor lo of the key
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 ; %%XMM1 is overwritten and becomes the middle accumulator
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM8
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ ; Karatsuba recombination: middle term = middle accumulator xor high sum xor low sum
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8 ; low qword of the middle term, moved to the upper half
+ vpsrldq %%T2, %%T2, 8 ; high qword of the middle term, moved to the lower half
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+; Encryption of a single block
+; Encrypts the block held in %%XMM0 in place with the 15 round keys read from
+; [%%GDATA + 16*0] .. [%%GDATA + 16*14]: one initial XOR with key 0, 13 vaesenc
+; rounds (keys 1-13) and a final vaesenclast with key 14.
+; Side effect: reassigns the preprocessor variable i (left at 14 after expansion).
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, %%XMM0, [%%GDATA+16*0] ; initial whitening with round key 0
+%assign i 1
+%rep 13 ; early key rounds (13)
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i] ; final key round (14)
+%endmacro
+
+
+;; Start of Stack Setup
+
+; FUNC_SAVE: function prologue.
+; Pushes r12-r15, keeps the resulting stack pointer in r14, then reserves
+; VARIABLE_OFFSET bytes and aligns rsp down to a 64-byte boundary.
+; For win64 output, xmm6-xmm15 are additionally stored at [rsp + LOCAL_STORAGE].
+; r14 has to stay intact until FUNC_RESTORE, which reloads rsp from it.
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp ; remember the pre-alignment stack pointer
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp down to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+; FUNC_RESTORE: function epilogue matching FUNC_SAVE.
+; For win64 output, reloads xmm6-xmm15 from [rsp + LOCAL_STORAGE]; then restores rsp
+; from r14 and pops r15, r14, r13, r12. Does not emit the ret itself.
+; rsp and r14 must still hold the values FUNC_SAVE left in them.
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14 ; drop the aligned scratch area; rsp now points at the pushed r15
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA:
+; AadLen = A_LEN, InLen = 0, PBlockLen = 0, PBlockEncKey, OrigIV = the 16 bytes at IV,
+; CurCount = those 16 bytes after the SHUF_MASK byte shuffle.
+; Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ vpxor xmm2, xmm3 ; NOTE(review): stored below as PBlockEncKey and commented as 0, but xmm2 xor xmm3 is zero only if CALC_AAD_HASH leaves both registers equal - confirm
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = xmm2 (intended to be 0, see note on the vpxor)
+ mov r10, %%IV
+ vmovdqu xmm2, [r10] ; load 16 bytes from the IV pointer
+ vmovdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv (byte-shuffled with SHUF_MASK)
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; A zero-length input is skipped entirely (the macro jumps straight to its end), so
+; READ_SMALL_DATA_INPUT is only ever reached with at least 1 byte of data.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; (InLen, AadHash, CurCount and, when a partial block remains, PBlockLen and PBlockEncKey)
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes ; nothing to do for an empty input
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ vmovdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4 ; r12 = number of whole 16-byte blocks
+ and r12, 7 ; r12 = (number of whole blocks) mod 8
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+ ; Each case below expands INITIAL_BLOCKS for a fixed block count and then
+ ; subtracts the bytes of those blocks from r13.
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left ; no whole blocks remain
+
+ sub r13, 128
+ je %%_eight_cipher_left ; exactly 8 blocks remained: skip the 8-at-a-time loop
+
+
+
+
+ vmovd r15d, xmm9
+ and r15d, 255 ; r15d = lowest byte of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ ; r15d > 247 means adding 8 would carry out of the tracked byte: take the in_order variant
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ ; same step as above, but xmm9 is byte-shuffled before and after the in_order expansion
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ ; hash the final 8 blocks held in xmm1-xmm8; the result is left in xmm14
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA + PBlockLen], r13 ; my_ctx_data.partial_blck_length = r13
+ ; handle the last <16 Byte block separately
+
+
+ vpaddd xmm9, xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA + CurCount], xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ ; fewer than 16 bytes in this call: read the r13 tail bytes through READ_SMALL_DATA_INPUT
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ ; at least 16 bytes in this call: load the 16 bytes that end at the end of the
+ ; input (stepping back 16-r13 bytes), then shift the tail into place
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ ; In both branches the masked ciphertext bytes are XORed into xmm14 and stored as
+ ; AadHash; no GHASH multiplication is performed for this partial block here.
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm9, xmm1 ; Ciphertext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm14, xmm2
+ vmovdqu [%%GDATA + AadHash], xmm14
+ %else
+ vpxor xmm9, xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm14, xmm9
+ vmovdqu [%%GDATA + AadHash], xmm14
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ ; more than 8 bytes: store the low qword at once, continue with the high qword
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ ; store the remaining bytes one at a time, lowest byte of rax first
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA), the tag destination (AUTH_TAG), the requested tag
+; length in bytes (AUTH_TAG_LEN) and whether encoding or decoding (ENC_DEC).
+; ENC_DEC is accepted but not referenced by the macro body.
+; Output: Authorization Tag written to AUTH_TAG. AUTH_TAG_LEN of 16 or 12 writes that many
+; bytes; any other value writes 8 bytes. AadHash in GDATA is updated when a partial
+; block is pending (PBlockLen != 0).
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen]
+ vmovdqu xmm14, [%%GDATA+AadHash]
+ vmovdqu xmm13, [%%GDATA+HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA+InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovq xmm15, r12 ; len(A) in xmm15 (full 64 bits, so AAD lengths >= 2^29 bytes are not truncated)
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm9, xmm14 ; tag = E(K, Y0) xor GHASH result
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ ; any length other than 16 or 12 falls through to the 8-byte tag
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_precomp_avx_gen4
+; (gcm_data *my_ctx_data);
+; Encrypts an all-zero block with the round keys at arg1, byte-shuffles the result,
+; computes HashKey<<1 mod poly from it, stores that at [arg1 + HashKey] and then
+; expands PRECOMPUTE on it.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_precomp_avx_gen4
+aesni_gcm256_precomp_avx_gen4:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp ; keep the pre-alignment stack pointer for the epilogue
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6 ; xmm6 = all-zero block
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1 ; shift each qword left by one bit
+ vpsrlq xmm2, xmm2, 63 ; xmm2 = the bit shifted out of each qword
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8 ; carry from the low qword moves into the high qword
+ vpsrldq xmm1, xmm1, 8 ; xmm1 = bit shifted out of the high qword (the overall carry)
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, xmm2, [POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_init_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; Thin wrapper: preserves r12/r13 (and xmm6 for win64) around one GCM_INIT expansion.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_init_avx_gen4
+aesni_gcm256_init_avx_gen4:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows; only xmm6 is saved here because
+ ; GCM_INIT is documented as clobbering xmm0-xmm6
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_update_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+; Thin wrapper: FUNC_SAVE, one GCM_ENC_DEC expansion in ENC mode, FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_update_avx_gen4
+aesni_gcm256_enc_update_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_update_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len); /* Length of data in Bytes for decryption. must be a multiple of 16 bytes*/
+; Thin wrapper: FUNC_SAVE, one GCM_ENC_DEC expansion in DEC mode, FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_update_avx_gen4
+aesni_gcm256_dec_update_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_finalize_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; Thin wrapper: preserves r12 (and, for win64, the callee-saved xmm registers that
+; GCM_COMPLETE uses) around one GCM_COMPLETE expansion in ENC mode.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_finalize_avx_gen4
+aesni_gcm256_enc_finalize_avx_gen4:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows.
+ ; GCM_COMPLETE loads HashKey into xmm13 and hands xmm10 to GHASH_MUL as a scratch
+ ; register, so both are saved together with xmm6, xmm9, xmm11, xmm14 and xmm15.
+ sub rsp, 7*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm10
+ vmovdqu [rsp + 3*16],xmm11
+ vmovdqu [rsp + 4*16],xmm13
+ vmovdqu [rsp + 5*16],xmm14
+ vmovdqu [rsp + 6*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 6*16]
+ vmovdqu xmm14 , [rsp + 5*16]
+ vmovdqu xmm13 , [rsp + 4*16]
+ vmovdqu xmm11 , [rsp + 3*16]
+ vmovdqu xmm10 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 7*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_finalize_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; Thin wrapper: preserves r12 (and, for win64, the callee-saved xmm registers that
+; GCM_COMPLETE uses) around one GCM_COMPLETE expansion in DEC mode.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_finalize_avx_gen4
+aesni_gcm256_dec_finalize_avx_gen4:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows.
+ ; GCM_COMPLETE loads HashKey into xmm13 and hands xmm10 to GHASH_MUL as a scratch
+ ; register, so both are saved together with xmm6, xmm9, xmm11, xmm14 and xmm15.
+ sub rsp, 7*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm10
+ vmovdqu [rsp + 3*16],xmm11
+ vmovdqu [rsp + 4*16],xmm13
+ vmovdqu [rsp + 5*16],xmm14
+ vmovdqu [rsp + 6*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 6*16]
+ vmovdqu xmm14 , [rsp + 5*16]
+ vmovdqu xmm13 , [rsp + 4*16]
+ vmovdqu xmm11 , [rsp + 3*16]
+ vmovdqu xmm10 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 7*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; One-shot encryption: expands GCM_INIT, GCM_ENC_DEC (ENC) and GCM_COMPLETE in sequence
+; between FUNC_SAVE and FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_avx_gen4
+aesni_gcm256_enc_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; One-shot decryption: expands GCM_INIT, GCM_ENC_DEC (DEC) and GCM_COMPLETE in sequence
+; between FUNC_SAVE and FUNC_RESTORE. The computed tag is written to auth_tag; no
+; comparison against an expected tag is performed in this routine.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_avx_gen4
+aesni_gcm256_dec_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm
new file mode 100644
index 00000000..ab49e077
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm
@@ -0,0 +1,2074 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; By the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any other aadLen value.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentation is used: one tab marks the GHASH part, two tabs mark the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+; Stack-frame layout constants for this file; all sizes are in bytes.
+default rel
+; Four 8-byte registers need to be pushed onto the stack to be preserved; this is their total size.
+%define STACK_OFFSET 8*4
+; TMP2..TMP8 are 16-byte scratch slots addressed as [rsp + TMPn] by GHASH_8_ENCRYPT_8_PARALLEL.
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+; Seven 16-byte slots in total (TMP2..TMP8).
+%define LOCAL_STORAGE 16*7
+; Room for ten 16-byte XMM registers on win64 only (presumably the callee-saved xmm6-xmm15 - confirm).
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+; NOTE(review): expands to an unparenthesized sum, so only use it where additive precedence is safe.
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 ), written back to the first operand (%%GH)
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes, input operand and result
+%define %%HK %2 ; 16 Bytes, only read
+%define %%T1 %3 ; xmm scratch, overwritten
+%define %%T2 %4 ; xmm scratch, overwritten
+%define %%T3 %5 ; xmm scratch, overwritten
+%define %%T4 %6 ; xmm scratch, overwritten
+%define %%T5 %7 ; xmm scratch, overwritten
+ ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqa %%T1, %%GH
+ pshufd %%T2, %%GH, 01001110b ; swap the 64-bit halves of %%GH
+ pshufd %%T3, %%HK, 01001110b ; swap the 64-bit halves of %%HK
+ pxor %%T2, %%GH ; %%T2 = (a1+a0)
+ pxor %%T3, %%HK ; %%T3 = (b1+b0)
+
+ pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T2, %%GH
+ pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%GH, %%T3
+ pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK
+
+; Reduce the 256-bit product <%%T1:%%GH> to 128 bits in two shift-and-xor phases.
+ ;first phase of the reduction
+ movdqa %%T2, %%GH
+ movdqa %%T3, %%GH
+ movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+ pslld %%T2, 31 ; packed (per-dword) left shift << 31
+ pslld %%T3, 30 ; packed (per-dword) left shift << 30
+ pslld %%T4, 25 ; packed (per-dword) left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T5, %%T2 ; keep a copy; it is folded back in during the second phase
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+ movdqa %%T2,%%GH ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%GH
+ movdqa %%T4,%%GH
+
+ psrld %%T2,1 ; packed (per-dword) right shift >> 1
+ psrld %%T3,2 ; packed (per-dword) right shift >> 2
+ psrld %%T4,7 ; packed (per-dword) right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T5 ; fold in the bits saved from the first phase
+ pxor %%GH, %%T2
+ pxor %%GH, %%T1 ; xor in the high half; the final result is in %%GH
+
+
+%endmacro
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1 ; base pointer for the HashKey_* stores below
+%define %%HK %2 ; xmm holding the hash key in the form GHASH_MUL expects; only read
+%define %%T1 %3 ; xmm scratch
+%define %%T2 %4 ; xmm scratch
+%define %%T3 %5 ; xmm scratch
+%define %%T4 %6 ; xmm accumulator for the successive key powers
+%define %%T5 %7 ; xmm scratch
+%define %%T6 %8 ; xmm scratch
+; PRECOMPUTE: derives HashKey^2..HashKey^8 by repeated GHASH_MUL with %%HK and stores each power
+; at [%%GDATA + HashKey_i] together with its Karatsuba helper at [%%GDATA + HashKey_i_k].
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
+ movdqa %%T4, %%HK
+ pshufd %%T1, %%HK, 01001110b ; swap the 64-bit halves
+ pxor %%T1, %%HK ; both halves now hold (high xor low)
+ movdqu [%%GDATA + HashKey_k], %%T1
+; Note: only HashKey_k is stored for the first power; %%HK itself is not written to %%GDATA here.
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly
+ movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly
+ movdqu [%%GDATA + HashKey_3], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_3_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly
+ movdqu [%%GDATA + HashKey_4], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly
+ movdqu [%%GDATA + HashKey_5], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_5_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly
+ movdqu [%%GDATA + HashKey_6], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly
+ movdqu [%%GDATA + HashKey_7], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly
+ movdqu [%%GDATA + HashKey_8], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_8_k], %%T1
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0. Memory is read only inside [INPUT, INPUT+LENGTH).
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ pxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH ; one past the last input byte; bytes are gathered backwards from here
+ xor %%TMP1, %%TMP1
+
+; Fewer than 8 bytes: gather everything bytewise into the low qword (loop 2).
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+ pinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist
+ je %%_done ; still tests the cmp above (pinsrq leaves the flags untouched): exactly 8 bytes
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ pinsrq %%OUTPUT, %%TMP1, 1 ; the remaining 1..7 bytes go into the high qword
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0 ; zero length: leave %%OUTPUT all zero
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ pinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 1
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+; %%A_IN and %%A_LEN are copied into %%T1/%%T2 first, so the caller's operands are not modified.
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ pxor %%AAD_HASH, %%AAD_HASH ; the hash starts from zero
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+; Main loop: absorb one full 16-byte block per iteration.
+%%_get_AAD_loop16:
+
+ movdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done ; length was an exact multiple of 16: no tail block
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+; Tail of 1..15 bytes; also reached with a zero-length AAD, which runs one GHASH_MUL on an all-zero block.
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+ mov r13, [%%GDATA + PBlockLen] ; r13 = bytes already held in the pending partial block
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ XLDR xmm1, [%%PLAIN_CYPH_IN] ;If 16 or more bytes of data, just fill the xmm register
+ jmp %%_data_read
+; NOTE(review): the load above ignores %%DATA_OFFSET while the path below adds it; equivalent only if the offset is 0 on entry - confirm.
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+ mov r13, [%%GDATA + PBlockLen] ; reload; the macro above was only given rax, r12, r15 as temps
+
+%%_data_read: ;Finished reading in data
+
+
+ movdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ movdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ movdqa xmm3, xmm1 ; keep the ciphertext input: it is what gets hashed when decrypting
+ pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward
+%%_no_extra_mask_1:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pand xmm3, xmm1
+ pshufb xmm3, [SHUF_MASK]
+ pshufb xmm3, xmm2
+ pxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+; The block is now complete: hash it and clear the pending length.
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still not full: only grow the pending length
+%%_dec_done:
+ movdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward
+%%_no_extra_mask_2:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pshufb xmm9, [SHUF_MASK]
+ pshufb xmm9, xmm2
+ pxor %%AAD_HASH, xmm9 ; when encrypting, the freshly produced ciphertext is what gets hashed
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+; The block is now complete: hash it and clear the pending length.
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN ; block still not full: only grow the pending length
+%%_encode_done:
+ movdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ pshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN ; block not completed: every input byte is written out
+%%_count_set:
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+; More than 8 bytes: store the low qword at once, then continue bytewise with the high qword.
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left: ; despite the name, entered with 1..8 bytes left (jle above)
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified
+; Updated AAD_HASH is returned in %%T3
+; On entry %%XMM8 holds the running hash; %%DATA_OFFSET is advanced by 16 for every block processed.
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+; The initial blocks live in reg(9-n)..reg(8); reg(8-n) carries the incoming hash (reg() is defined outside this chunk).
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+
+ ; start AES for %%num_initial_blocks blocks
+ movdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+; Build one byte-swapped counter block per initial block.
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa reg(i), %%CTR
+ pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+; Round 0: xor in the first round key stored at [%%GDATA+16*0].
+movdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ pxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+; Rounds 1..13: each round key is loaded once and applied to every initial block.
+%assign j 1
+%rep 13 ; encrypt N blocks with 13 key rounds
+movdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+; j is 14 here, so the load below picks up the last round key.
+movdqu %%T_key, [%%GDATA+16*j] ; encrypt with last (14th) key round
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+; Xor the keystream with the input, store the result, and keep the ciphertext side for GHASH.
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ pxor reg(i), %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(i), %%T1
+ %endif
+ pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+; Chain the hash through the blocks: each block is xored with the previous hash value, then multiplied by the key.
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ pxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ movdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; %%LENGTH < 128 (signed compare): skip the 8-block stage below
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Encrypt the next 8 blocks; their byte-swapped ciphertext is left in %%XMM1-%%XMM8 and is not hashed in this macro
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM1, %%CTR
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM2, %%CTR
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM3, %%CTR
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM4, %%CTR
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM5, %%CTR
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM6, %%CTR
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM7, %%CTR
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM8, %%CTR
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu %%T_key, [%%GDATA+16*0] ; round 0 key
+ pxor %%XMM1, %%T_key
+ pxor %%XMM2, %%T_key
+ pxor %%XMM3, %%T_key
+ pxor %%XMM4, %%T_key
+ pxor %%XMM5, %%T_key
+ pxor %%XMM6, %%T_key
+ pxor %%XMM7, %%T_key
+ pxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 13 ; do early (13) rounds
+ movdqu %%T_key, [%%GDATA+16*i]
+ aesenc %%XMM1, %%T_key
+ aesenc %%XMM2, %%T_key
+ aesenc %%XMM3, %%T_key
+ aesenc %%XMM4, %%T_key
+ aesenc %%XMM5, %%T_key
+ aesenc %%XMM6, %%T_key
+ aesenc %%XMM7, %%T_key
+ aesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ movdqu %%T_key, [%%GDATA+16*i] ; do final key round
+ aesenclast %%XMM1, %%T_key
+ aesenclast %%XMM2, %%T_key
+ aesenclast %%XMM3, %%T_key
+ aesenclast %%XMM4, %%T_key
+ aesenclast %%XMM5, %%T_key
+ aesenclast %%XMM6, %%T_key
+ aesenclast %%XMM7, %%T_key
+ aesenclast %%XMM8, %%T_key
+; For each of the 8 blocks: xor with the input and store; when decrypting, keep the input (ciphertext) in the register.
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ pxor %%XMM1, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM1, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ pxor %%XMM2, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM2, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ pxor %%XMM3, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM3, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ pxor %%XMM4, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM4, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ pxor %%XMM5, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM5, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ pxor %%XMM6, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM6, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ pxor %%XMM7, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM7, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ pxor %%XMM8, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ movdqa %%T7, %%XMM1
+ movdqu [rsp + TMP2], %%XMM2
+ movdqu [rsp + TMP3], %%XMM3
+ movdqu [rsp + TMP4], %%XMM4
+ movdqu [rsp + TMP5], %%XMM5
+ movdqu [rsp + TMP6], %%XMM6
+ movdqu [rsp + TMP7], %%XMM7
+ movdqu [rsp + TMP8], %%XMM8
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+
+ movdqa %%T4, %%T7
+ pshufd %%T6, %%T7, 01001110b
+ pxor %%T6, %%T7
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ %endif
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ movdqa %%XMM1, %%CTR
+
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+ %endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ movdqu %%T1, [%%GDATA + 16*0]
+ pxor %%XMM1, %%T1
+ pxor %%XMM2, %%T1
+ pxor %%XMM3, %%T1
+ pxor %%XMM4, %%T1
+ pxor %%XMM5, %%T1
+ pxor %%XMM6, %%T1
+ pxor %%XMM7, %%T1
+ pxor %%XMM8, %%T1
+
+ ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP2]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*1]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T1, [%%GDATA + 16*2]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqu %%T1, [rsp + TMP3]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*3]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP4]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*4]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*5]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP5]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+
+ movdqu %%T1, [%%GDATA + 16*6]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+ movdqu %%T1, [rsp + TMP6]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*7]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP7]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*8]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP8]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T7, %%T3
+ pxor %%T4, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*9]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T1, [%%GDATA + 16*10]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*11]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*12]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*13]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T5, [%%GDATA + 16*14] ; finish last key round
+
+
+%assign i 0
+%assign j 1
+%rep 8
+ XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%T3, %%T1
+ %endif
+
+ pxor %%T1, %%T5
+ aesenclast reg(j), %%T1 ; XMM1:XMM8
+ XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer
+
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(j), %%T3
+ %endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+
+
+ pxor %%T2, %%T6
+ pxor %%T2, %%T4
+ pxor %%T2, %%T7
+
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T3
+ pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7
+
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently
+
+ pslld %%T2, 31 ; packed right shifting << 31
+ pslld %%T3, 30 ; packed right shifting shift << 30
+ pslld %%T1, 25 ; packed right shifting shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T1
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ ;second phase of the reduction
+ movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T1) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T1,%%T7
+
+ psrld %%T2,1 ; packed left shifting >> 1
+ psrld %%T3,2 ; packed left shifting >> 2
+ psrld %%T1,7 ; packed left shifting >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T1
+
+ pxor %%T2, %%T5
+ pxor %%T7, %%T2
+ pxor %%T7, %%T4 ; the result is in %%T4
+
+
+ pxor %%XMM1, %%T7
+
+%endmacro
+
+
+; GHASH_LAST_8: GHASH the last 8 ciphertext blocks.
+; Each of %%XMM1..%%XMM8 is carry-less multiplied (Karatsuba) by the value loaded from
+; HashKey_8 .. HashKey respectively; the matching HashKey_i_k entry supplies the operand
+; for the middle (a1+a0)*(b1+b0) product. The eight products are XORed together and
+; reduced once, in two phases.
+; Input: %%GDATA (base address for the HashKey loads), %%XMM1-%%XMM8 (blocks to hash).
+; Output: %%T6 holds the reduced result.
+; Clobbers %%T1-%%T7 and %%XMM1-%%XMM8.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ; Karatsuba Method
+ movdqa %%T6, %%XMM1
+ pshufd %%T2, %%XMM1, 01001110b
+ pxor %%T2, %%XMM1
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1
+
+ pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ movdqa %%T7, %%XMM1
+ movdqa %%XMM1, %%T2 ; result in %%T6 (high), %%T7 (low), %%XMM1 (middle)
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM2
+ pshufd %%T2, %%XMM2, 01001110b
+ pxor %%T2, %%XMM2
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM2
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM3
+ pshufd %%T2, %%XMM3, 01001110b
+ pxor %%T2, %%XMM3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM3
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM4
+ pshufd %%T2, %%XMM4, 01001110b
+ pxor %%T2, %%XMM4
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM4, %%T5, 0x00 ; %%XMM4 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM4
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM5
+ pshufd %%T2, %%XMM5, 01001110b
+ pxor %%T2, %%XMM5
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM5, %%T5, 0x00 ; %%XMM5 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM5
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM6
+ pshufd %%T2, %%XMM6, 01001110b
+ pxor %%T2, %%XMM6
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM6, %%T5, 0x00 ; %%XMM6 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM6
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM7
+ pshufd %%T2, %%XMM7, 01001110b
+ pxor %%T2, %%XMM7
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM7, %%T5, 0x00 ; %%XMM7 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM7
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM8
+ pshufd %%T2, %%XMM8, 01001110b
+ pxor %%T2, %%XMM8
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM8, %%T5, 0x00 ; %%XMM8 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM8
+ pxor %%T2, %%XMM1 ; fold the last middle product into the accumulated middle part
+ pxor %%T2, %%T6
+ pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm
+
+
+ movdqa %%T4, %%T2
+ pslldq %%T4, 8 ; shift-L %%T4 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T4
+ pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+ pslld %%T2, 31 ; packed left shift << 31
+ pslld %%T3, 30 ; packed left shift << 30
+ pslld %%T4, 25 ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T1, %%T2
+ psrldq %%T1, 4 ; shift-R %%T1 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+ movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T4,%%T7
+
+ psrld %%T2,1 ; packed right shift >> 1
+ psrld %%T3,2 ; packed right shift >> 2
+ psrld %%T4,7 ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T1
+ pxor %%T7, %%T2
+ pxor %%T6, %%T7 ; the result is in %%T6
+
+%endmacro
+
+; ENCRYPT_SINGLE_BLOCK: encryption of a single 16-byte block held in a register.
+; Input: %%GDATA (base address; round keys are read from offsets 16*0 .. 16*14),
+; %%ST (xmm register holding the block), %%T1 (xmm scratch for the current round key).
+; Output: %%ST is overwritten with the encrypted block. Clobbers %%T1.
+; The round count is fixed: one XOR with key 0, 13 aesenc rounds, one aesenclast.
+; Side effect: the assembler variable i is left at 14.
+%macro ENCRYPT_SINGLE_BLOCK 3
+%define %%GDATA %1
+%define %%ST %2
+%define %%T1 %3
+ movdqu %%T1, [%%GDATA+16*0]
+ pxor %%ST, %%T1 ; initial round: XOR with round key 0
+%assign i 1
+%rep 13
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenc %%ST, %%T1 ; rounds 1..13
+%assign i (i+1)
+%endrep
+ movdqu %%T1, [%%GDATA+16*i] ; i == 14 here: last round key
+ aesenclast %%ST, %%T1
+%endmacro
+
+
+;; Start of Stack Setup
+
+; FUNC_SAVE: function prologue.
+; Pushes r12-r15, keeps the post-push rsp in r14, then reserves VARIABLE_OFFSET bytes
+; and rounds rsp down to a 64-byte boundary. On win64 it additionally stores
+; xmm6-xmm15 at [rsp + LOCAL_STORAGE]. Must be paired with FUNC_RESTORE, and r14 must
+; not be modified in between because it is the only record of the original rsp.
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp ; remember rsp before allocating/aligning the local area
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+; FUNC_RESTORE: function epilogue matching FUNC_SAVE.
+; On win64 reloads xmm6-xmm15 from [rsp + LOCAL_STORAGE]; then resets rsp from r14
+; (the value FUNC_SAVE recorded after its pushes) and pops r15, r14, r13, r12.
+; Does not emit the ret.
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14 ; drop the aligned local area; rsp now points at the saved r15
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA:
+; AadLen = A_LEN, InLen = 0, PBlockLen = 0, PBlockEncKey = 0, OrigIV = the 16 bytes
+; at IV, CurCount = those 16 bytes shuffled through SHUF_MASK.
+; Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ movdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ ; xmm2 and xmm3 were handed to CALC_AAD_HASH as scratch, so xmm2 ^ xmm3 is not
+ ; guaranteed to be zero; clear xmm2 against itself so the store below really writes 0.
+ pxor xmm2, xmm2
+ mov r10, %%A_LEN
+
+ movdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ movdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ movdqu xmm2, [r10] ; load the 16-byte pre-counter block
+ movdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ pshufb xmm2, [SHUF_MASK] ; byte-swap before storing as the running counter
+
+ movdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; A zero-length input is skipped by the length check at the top of the macro; a trailing
+; block of fewer than 16 bytes is read with READ_SMALL_DATA_INPUT when the whole input is
+; shorter than 16 bytes.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes ; nothing to do for an empty input
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ movdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ movdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+ mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ;save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7 ; r12 = (number of whole 16-byte blocks) mod 8
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left ; exactly one group of 8 blocks remains: only its GHASH is left
+
+
+
+
+ movd r15d, xmm9
+ and r15d, 255 ; r15b tracks the low byte of the counter held in xmm9
+ pshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8 ; low counter byte would wrap within the next 8 increments
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ pshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ pshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ ; GHASH_LAST_8 leaves its result in its %%T6 argument, which is xmm14 here
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ movdqu [%%GDATA + AadHash], xmm14
+ movdqu [%%GDATA + CurCount], xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (bytes left after PARTIAL_BLOCK) mod 16
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA + PBlockLen], r13 ; my_ctx.data.partial_blck_length = r13
+ ; handle the last <16 Byte block separately
+
+ paddd xmm9, [ONE] ; INCR CNT to get Yn
+ movdqu [%%GDATA + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9
+ pshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9, xmm2 ; E(K, Yn)
+ movdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ ; fewer than 16 bytes of input in total: a 16-byte load ending at the end of the
+ ; input would start before the buffer, so read the tail with READ_SMALL_DATA_INPUT
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13 ; temporarily point at the last 16 bytes of the input
+
+ movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16 ; restore %%DATA_OFFSET
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ movdqa xmm2, xmm1 ; keep a copy of the ciphertext for the hash
+ pxor xmm9, xmm1 ; Ciphertext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pand xmm2, xmm1
+ pshufb xmm2, [SHUF_MASK]
+ pxor xmm14, xmm2
+ movdqu [%%GDATA + AadHash], xmm14
+
+ %else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pshufb xmm9, [SHUF_MASK]
+ pxor xmm14, xmm9
+ movdqu [%%GDATA + AadHash], xmm14
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ ; write the remaining bytes (at most 8) one at a time
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN).
+; A length of 16 writes 16 bytes, 12 writes 12 bytes, any other value writes 8 bytes.
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm2, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen] ; r12 = length of the pending partial block (bytes)
+ movdqu xmm14, [%%GDATA + AadHash]
+ movdqu xmm13, [%%GDATA + HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ movdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ ; len(A) occupies a full 64-bit field of the length block, so move all of r12;
+ ; a 32-bit movd would drop the upper half of the bit count.
+ movq xmm15, r12 ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits
+ movq xmm1, %%PLAIN_CYPH_LEN
+ pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ pxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ pxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu xmm9, [%%GDATA + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9, xmm2 ; E(K, Y0)
+
+ pxor xmm9, xmm14 ; tag = E(K, Y0) XOR GHASH result
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ ; any length other than 16 or 12 falls through to the 8-byte tag
+%%_T_8:
+ movq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ movq rax, xmm9
+ mov [r10], rax
+ psrldq xmm9, 8
+ movd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ movdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ;GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_precomp_sse
+; (gcm_data *my_ctx_data);
+; Encrypts an all-zero block with the round keys read from my_ctx_data, stores
+; (that value << 1 mod poly) at [my_ctx_data + HashKey], then invokes PRECOMPUTE
+; on it.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_precomp_sse
+aesni_gcm256_precomp_sse:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp ; remember rsp so the aligned local area can be dropped on exit
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ pxor xmm6, xmm6 ; all-zero input block
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey
+
+ pshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ movdqa xmm2, xmm6
+ psllq xmm6, 1 ; shift each 64-bit half left by one
+ psrlq xmm2, 63 ; xmm2 = the bit shifted out of each half
+ movdqa xmm1, xmm2
+ pslldq xmm2, 8 ; carry out of the low half moves into the high half
+ psrldq xmm1, 8 ; xmm1 = carry out of the high half (bit shifted out of the 128-bit value)
+ por xmm6, xmm2
+ ;reduction
+ ; NOTE(review): presumably builds a mask from the shifted-out bit so POLY is XORed
+ ; in only when that bit was set - confirm against the TWOONE/POLY constants.
+ pshufd xmm2, xmm1, 00100100b
+ pcmpeqd xmm2, [TWOONE]
+ pand xmm2, [POLY]
+ pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_init_sse(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; Thin wrapper around GCM_INIT: saves the callee-saved registers that GCM_INIT is
+; documented to clobber (r12, r13, and xmm6 on win64), expands the macro, restores them.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_init_sse
+aesni_gcm256_init_sse:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows; only xmm6 is saved here
+ sub rsp, 1*16
+ movdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_update_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+; Expands GCM_ENC_DEC in ENC mode between FUNC_SAVE and FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_update_sse
+aesni_gcm256_enc_update_sse:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_update_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Cyphertext input */
+; u64 plaintext_len); /* Length of data in Bytes for decryption. must be a multiple of 16 bytes*/
+; Expands GCM_ENC_DEC in DEC mode between FUNC_SAVE and FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_update_sse
+aesni_gcm256_dec_update_sse:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_finalize_sse(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; Expands GCM_COMPLETE in ENC mode, preserving r12 and (on win64) every callee-saved
+; xmm register the macro is handed: xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_finalize_sse
+aesni_gcm256_enc_finalize_sse:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows.
+ ; GCM_COMPLETE loads HashKey into xmm13 and passes xmm10 to GHASH_MUL as scratch,
+ ; so both are saved alongside xmm6, xmm9, xmm11, xmm14 and xmm15.
+ sub rsp, 7*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm10
+ movdqu [rsp + 3*16],xmm11
+ movdqu [rsp + 4*16],xmm13
+ movdqu [rsp + 5*16],xmm14
+ movdqu [rsp + 6*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 6*16]
+ movdqu xmm14 , [rsp + 5*16]
+ movdqu xmm13 , [rsp + 4*16]
+ movdqu xmm11 , [rsp + 3*16]
+ movdqu xmm10 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 7*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_finalize_sse(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; Expands GCM_COMPLETE in DEC mode, preserving r12 and (on win64) every callee-saved
+; xmm register the macro is handed: xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_finalize_sse
+aesni_gcm256_dec_finalize_sse:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows.
+ ; GCM_COMPLETE loads HashKey into xmm13 and passes xmm10 to GHASH_MUL as scratch,
+ ; so both are saved alongside xmm6, xmm9, xmm11, xmm14 and xmm15.
+ sub rsp, 7*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm10
+ movdqu [rsp + 3*16],xmm11
+ movdqu [rsp + 4*16],xmm13
+ movdqu [rsp + 5*16],xmm14
+ movdqu [rsp + 6*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 6*16]
+ movdqu xmm14 , [rsp + 5*16]
+ movdqu xmm13 , [rsp + 4*16]
+ movdqu xmm11 , [rsp + 3*16]
+ movdqu xmm10 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 7*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; One-shot encryption: GCM_INIT, GCM_ENC_DEC (ENC) and GCM_COMPLETE expanded back to
+; back between FUNC_SAVE and FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_sse
+aesni_gcm256_enc_sse:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+; One-shot decryption: GCM_INIT, GCM_ENC_DEC (DEC) and GCM_COMPLETE expanded back to
+; back between FUNC_SAVE and FUNC_RESTORE. The computed tag is written to auth_tag;
+; no tag comparison is performed in this function.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_sse
+aesni_gcm256_dec_sse:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
new file mode 100644
index 00000000..d7df9712
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
@@ -0,0 +1,163 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+
+
+;;;;;;
+; Remove the need for different yasm commandlines on Linux vs Windows
+%ifidn __OUTPUT_FORMAT__, elf64
+%define LINUX
+%else
+%define WIN_ABI
+%endif
+
+
+section .data
+
+align 16
+
+POLY dq 0x0000000000000001, 0xC200000000000000
+POLY2 dq 0x00000001C2000000, 0xC200000000000000
+TWOONE dq 0x0000000000000001, 0x0000000100000000
+
+; order of these constants should not change.
+; more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+SHIFT_MASK dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+ALL_F dq 0xffffffffffffffff, 0xffffffffffffffff
+ZERO dq 0x0000000000000000, 0x0000000000000000
+ONE dq 0x0000000000000001, 0x0000000000000000
+ONEf dq 0x0000000000000000, 0x0100000000000000
+
+section .text
+
+
+;;define the fields of gcm_data struct
+;typedef struct gcm_data
+;{
+; u8 expanded_keys[16*15];
+; u8 shifted_hkey_1[16]; // store HashKey <<1 mod poly here
+; u8 shifted_hkey_2[16]; // store HashKey^2 <<1 mod poly here
+; u8 shifted_hkey_3[16]; // store HashKey^3 <<1 mod poly here
+; u8 shifted_hkey_4[16]; // store HashKey^4 <<1 mod poly here
+; u8 shifted_hkey_5[16]; // store HashKey^5 <<1 mod poly here
+; u8 shifted_hkey_6[16]; // store HashKey^6 <<1 mod poly here
+; u8 shifted_hkey_7[16]; // store HashKey^7 <<1 mod poly here
+; u8 shifted_hkey_8[16]; // store HashKey^8 <<1 mod poly here
+; u8 shifted_hkey_1_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_2_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_3_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_4_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_5_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_6_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_7_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_8_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+;} gcm_data;
+
+%define HashKey 16*15 ; store HashKey <<1 mod poly here
+%define HashKey_2 16*16 ; store HashKey^2 <<1 mod poly here
+%define HashKey_3 16*17 ; store HashKey^3 <<1 mod poly here
+%define HashKey_4 16*18 ; store HashKey^4 <<1 mod poly here
+%define HashKey_5 16*19 ; store HashKey^5 <<1 mod poly here
+%define HashKey_6 16*20 ; store HashKey^6 <<1 mod poly here
+%define HashKey_7 16*21 ; store HashKey^7 <<1 mod poly here
+%define HashKey_8 16*22 ; store HashKey^8 <<1 mod poly here
+%define HashKey_k 16*23 ; store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_2_k 16*24 ; store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_3_k 16*25 ; store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_4_k 16*26 ; store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_5_k 16*27 ; store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_6_k 16*28 ; store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_7_k 16*29 ; store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_8_k 16*30 ; store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+%define AadHash 16*31 ; store current Hash of data which has been input
+%define AadLen 16*32 ; store length of input data which will not be encrypted or decrypted
+%define InLen 16*32+8 ; store length of input data which will be encrypted or decrypted
+%define PBlockEncKey 16*33 ; encryption key for the partial block at the end of the previous update
+%define OrigIV 16*34 ; input IV
+%define CurCount 16*35 ; Current counter for generation of encryption key
+%define PBlockLen 16*36 ; length of partial block at the end of the previous update
+
+%define reg(q) xmm %+ q
+
+
+
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ %xdefine arg1 rcx ; non-elf64 output (WIN_ABI above): arguments 1-4 are in rcx, rdx, r8, r9
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ %xdefine arg4 r9
+ %xdefine arg5 [r14 + STACK_OFFSET + 8*5] ; arguments 5-9 are read from the stack; r14 and STACK_OFFSET are not set in this file -- TODO confirm where
+ %xdefine arg6 [r14 + STACK_OFFSET + 8*6]
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*7]
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*8]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*9]
+
+%else
+ %xdefine arg1 rdi ; elf64 output (LINUX above): arguments 1-6 are in rdi, rsi, rdx, rcx, r8, r9
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ %xdefine arg4 rcx
+ %xdefine arg5 r8
+ %xdefine arg6 r9
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*1] ; arguments 7-9 are read from the stack; r14 and STACK_OFFSET are not set in this file -- TODO confirm where
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*2]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*3]
+
+%endif
+
+%ifdef NT_LDST
+ %define NT_LD
+ %define NT_ST
+%endif
+
+;;; Select the load instruction: non-temporal when NT_LD is defined
+%ifdef NT_LD
+ %define XLDR movntdqa
+ %define VXLDR vmovntdqa
+%else
+ %define XLDR movdqu
+ %define VXLDR vmovdqu
+%endif
+
+;;; Select the store instruction: non-temporal when NT_ST is defined
+%ifdef NT_ST
+ %define XSTR movntdq
+ %define VXSTR vmovntdq
+%else
+ %define XSTR movdqu
+ %define VXSTR vmovdqu
+%endif
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm
new file mode 100644
index 00000000..e46066ac
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm
@@ -0,0 +1,172 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+
+extern aesni_gcm128_init_sse
+extern aesni_gcm128_init_avx_gen4
+extern aesni_gcm128_init_avx_gen2
+
+extern aesni_gcm128_enc_sse
+extern aesni_gcm128_enc_avx_gen4
+extern aesni_gcm128_enc_avx_gen2
+extern aesni_gcm128_enc_update_sse
+extern aesni_gcm128_enc_update_avx_gen4
+extern aesni_gcm128_enc_update_avx_gen2
+extern aesni_gcm128_enc_finalize_sse
+extern aesni_gcm128_enc_finalize_avx_gen4
+extern aesni_gcm128_enc_finalize_avx_gen2
+
+extern aesni_gcm128_dec_sse
+extern aesni_gcm128_dec_avx_gen4
+extern aesni_gcm128_dec_avx_gen2
+extern aesni_gcm128_dec_update_sse
+extern aesni_gcm128_dec_update_avx_gen4
+extern aesni_gcm128_dec_update_avx_gen2
+extern aesni_gcm128_dec_finalize_sse
+extern aesni_gcm128_dec_finalize_avx_gen4
+extern aesni_gcm128_dec_finalize_avx_gen2
+
+extern aesni_gcm128_precomp_sse
+extern aesni_gcm128_precomp_avx_gen4
+extern aesni_gcm128_precomp_avx_gen2
+
+
+
+extern aesni_gcm256_init_sse
+extern aesni_gcm256_init_avx_gen4
+extern aesni_gcm256_init_avx_gen2
+
+extern aesni_gcm256_enc_sse
+extern aesni_gcm256_enc_avx_gen4
+extern aesni_gcm256_enc_avx_gen2
+extern aesni_gcm256_enc_update_sse
+extern aesni_gcm256_enc_update_avx_gen4
+extern aesni_gcm256_enc_update_avx_gen2
+extern aesni_gcm256_enc_finalize_sse
+extern aesni_gcm256_enc_finalize_avx_gen4
+extern aesni_gcm256_enc_finalize_avx_gen2
+
+extern aesni_gcm256_dec_sse
+extern aesni_gcm256_dec_avx_gen4
+extern aesni_gcm256_dec_avx_gen2
+extern aesni_gcm256_dec_update_sse
+extern aesni_gcm256_dec_update_avx_gen4
+extern aesni_gcm256_dec_update_avx_gen2
+extern aesni_gcm256_dec_finalize_sse
+extern aesni_gcm256_dec_finalize_avx_gen4
+extern aesni_gcm256_dec_finalize_avx_gen2
+
+extern aesni_gcm256_precomp_sse
+extern aesni_gcm256_precomp_avx_gen4
+extern aesni_gcm256_precomp_avx_gen2
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate aesni_gcm128 interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp
+;;;;
+mbin_interface aesni_gcm128_init
+mbin_dispatch_init aesni_gcm128_init, aesni_gcm128_init_sse, aesni_gcm128_init_avx_gen2, aesni_gcm128_init_avx_gen4
+
+mbin_interface aesni_gcm128_enc
+mbin_dispatch_init aesni_gcm128_enc, aesni_gcm128_enc_sse, aesni_gcm128_enc_avx_gen2, aesni_gcm128_enc_avx_gen4
+
+mbin_interface aesni_gcm128_enc_update
+mbin_dispatch_init aesni_gcm128_enc_update, aesni_gcm128_enc_update_sse, aesni_gcm128_enc_update_avx_gen2, aesni_gcm128_enc_update_avx_gen4
+
+mbin_interface aesni_gcm128_enc_finalize
+mbin_dispatch_init aesni_gcm128_enc_finalize, aesni_gcm128_enc_finalize_sse, aesni_gcm128_enc_finalize_avx_gen2, aesni_gcm128_enc_finalize_avx_gen4
+
+mbin_interface aesni_gcm128_dec
+mbin_dispatch_init aesni_gcm128_dec, aesni_gcm128_dec_sse, aesni_gcm128_dec_avx_gen2, aesni_gcm128_dec_avx_gen4
+
+mbin_interface aesni_gcm128_dec_update
+mbin_dispatch_init aesni_gcm128_dec_update, aesni_gcm128_dec_update_sse, aesni_gcm128_dec_update_avx_gen2, aesni_gcm128_dec_update_avx_gen4
+
+mbin_interface aesni_gcm128_dec_finalize
+mbin_dispatch_init aesni_gcm128_dec_finalize, aesni_gcm128_dec_finalize_sse, aesni_gcm128_dec_finalize_avx_gen2, aesni_gcm128_dec_finalize_avx_gen4
+
+mbin_interface aesni_gcm128_precomp
+mbin_dispatch_init aesni_gcm128_precomp, aesni_gcm128_precomp_sse, aesni_gcm128_precomp_avx_gen2, aesni_gcm128_precomp_avx_gen4
+
+;;;;
+; instantiate aesni_gcm256 interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp
+;;;;
+mbin_interface aesni_gcm256_init
+mbin_dispatch_init aesni_gcm256_init, aesni_gcm256_init_sse, aesni_gcm256_init_avx_gen2, aesni_gcm256_init_avx_gen4
+
+mbin_interface aesni_gcm256_enc
+mbin_dispatch_init aesni_gcm256_enc, aesni_gcm256_enc_sse, aesni_gcm256_enc_avx_gen2, aesni_gcm256_enc_avx_gen4
+
+mbin_interface aesni_gcm256_enc_update
+mbin_dispatch_init aesni_gcm256_enc_update, aesni_gcm256_enc_update_sse, aesni_gcm256_enc_update_avx_gen2, aesni_gcm256_enc_update_avx_gen4
+
+mbin_interface aesni_gcm256_enc_finalize
+mbin_dispatch_init aesni_gcm256_enc_finalize, aesni_gcm256_enc_finalize_sse, aesni_gcm256_enc_finalize_avx_gen2, aesni_gcm256_enc_finalize_avx_gen4
+
+mbin_interface aesni_gcm256_dec
+mbin_dispatch_init aesni_gcm256_dec, aesni_gcm256_dec_sse, aesni_gcm256_dec_avx_gen2, aesni_gcm256_dec_avx_gen4
+
+mbin_interface aesni_gcm256_dec_update
+mbin_dispatch_init aesni_gcm256_dec_update, aesni_gcm256_dec_update_sse, aesni_gcm256_dec_update_avx_gen2, aesni_gcm256_dec_update_avx_gen4
+
+mbin_interface aesni_gcm256_dec_finalize
+mbin_dispatch_init aesni_gcm256_dec_finalize, aesni_gcm256_dec_finalize_sse, aesni_gcm256_dec_finalize_avx_gen2, aesni_gcm256_dec_finalize_avx_gen4
+
+mbin_interface aesni_gcm256_precomp
+mbin_dispatch_init aesni_gcm256_precomp, aesni_gcm256_precomp_sse, aesni_gcm256_precomp_avx_gen2, aesni_gcm256_precomp_avx_gen4
+
+
+;;; func core, ver, snum
+slversion aesni_gcm128_enc, 00, 00, 0280 ; the precomp interfaces instantiated above get no slversion entry in this table
+slversion aesni_gcm128_dec, 00, 00, 0281
+slversion aesni_gcm128_init, 00, 00, 0282
+slversion aesni_gcm128_enc_update, 00, 00, 0283
+slversion aesni_gcm128_dec_update, 00, 00, 0284
+slversion aesni_gcm128_enc_finalize, 00, 00, 0285
+slversion aesni_gcm128_dec_finalize, 00, 00, 0286 ; 0287 is skipped here: gcm_pre.c uses it for aesni_gcm128_pre_slver
+slversion aesni_gcm256_enc, 00, 00, 0288
+slversion aesni_gcm256_dec, 00, 00, 0289
+slversion aesni_gcm256_init, 00, 00, 028a
+slversion aesni_gcm256_enc_update, 00, 00, 028b
+slversion aesni_gcm256_dec_update, 00, 00, 028c
+slversion aesni_gcm256_enc_finalize, 00, 00, 028d
+slversion aesni_gcm256_dec_finalize, 00, 00, 028e ; 028f is used by gcm_pre.c for aesni_gcm256_pre_slver
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c
new file mode 100644
index 00000000..a8e9f39e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c
@@ -0,0 +1,272 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <test.h>
+#include "ossl_helper.h"
+#include "gcm_vectors.h"
+
+// Test sizing. Expression macros are parenthesized so they expand safely
+// inside larger expressions such as casts and multiplications.
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN (8 * 1024)
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE (32 * 1024 * 1024) /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define AAD_LENGTH 16
+#define TEST_MEM TEST_LEN
+
+static unsigned char *plaintext, *gcm_plaintext, *cyphertext, *ossl_plaintext,
+ *ossl_cyphertext, *gcm_tag, *ossl_tag, *IV, *AAD;
+static uint8_t key128[GCM_128_KEY_LEN];
+static uint8_t key256[GCM_256_KEY_LEN];
+uint8_t iv_len = 0;
+
+/* Fill the first size bytes at data with rand() values truncated to uint8_t. */
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+        uint8_t *const end = data + size;
+
+        while (data != end) {
+                *data++ = rand();
+        }
+}
+
+/*
+ * Compare len bytes of test against expected.
+ *
+ * On a mismatch, prints the vector index vect, the label data_name and the
+ * first differing byte pair together with its offset (all in hex).
+ *
+ * Returns 1 when the buffers are identical, 0 otherwise.
+ */
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, int vect, char *data_name)
+{
+        uint64_t a;
+
+        if (memcmp(test, expected, len) == 0)
+                return 1;
+
+        printf(" v[%d] expected results don't match %s \t\t", vect, data_name);
+        for (a = 0; a < len; a++) {
+                if (test[a] != expected[a]) {
+                        // uint64_t is not unsigned long on every ABI, so cast the
+                        // offsets to the type the %llx conversions expect
+                        printf(" '%x' != '%x' at %llx of %llx\n",
+                               test[a], expected[a],
+                               (unsigned long long)a, (unsigned long long)len);
+                        break;
+                }
+        }
+        return 0;
+}
+
+/*
+ * Benchmark the ISA-L AES-GCM routines against OpenSSL for 128- and 256-bit keys.
+ *
+ * Each encrypt is first run once (this also preloads the code) and the ISA-L
+ * cipher text and tag are checked against OpenSSL's. After that, TEST_LOOPS
+ * calls of every encrypt and decrypt routine are timed and one result line is
+ * printed per routine.
+ *
+ * Works on the file-scope buffers, which must be allocated and filled before
+ * the call. Mismatches are only printed; nothing is returned.
+ */
+void aes_gcm_perf(void)
+{
+        struct gcm_data gdata;
+        struct gcm_data gdata256;
+        int i;
+
+        printf
+            ("AES GCM performance parameters plain text length:%d; IV length:%d; AAD length:%d \n",
+             TEST_LEN, GCM_IV_LEN, AAD_LENGTH);
+
+        mk_rand_data(key128, sizeof(key128));
+        mk_rand_data(key256, sizeof(key256));
+
+        // This is only required once for a given key
+        aesni_gcm128_pre(key128, &gdata);
+        aesni_gcm256_pre(key256, &gdata256);
+
+        // Preload code cache
+        aesni_gcm128_enc(&gdata, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
+                         gcm_tag, MAX_TAG_LEN);
+        openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
+                            plaintext, TEST_LEN, ossl_cyphertext);
+        check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
+                   "ISA-L vs OpenSSL 128 key cypher text (C)");
+        check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 128 tag (T)");
+        aesni_gcm256_enc(&gdata256, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
+                         gcm_tag, MAX_TAG_LEN);
+        openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
+                                plaintext, TEST_LEN, ossl_cyphertext);
+        check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
+                   "ISA-L vs OpenSSL 256 cypher text (C)");
+        check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 256 tag (T)");
+
+        {
+                struct perf start, stop;
+
+                perf_start(&start);
+                for (i = 0; i < TEST_LOOPS; i++) {
+                        aesni_gcm128_enc(&gdata, cyphertext, plaintext, TEST_LEN, IV, AAD,
+                                         AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+                }
+
+                perf_stop(&stop);
+                printf(" aes_gcm_enc" TEST_TYPE_STR ":\t");
+                perf_print(stop, start, (long long)TEST_LEN * i);
+        }
+        {
+                struct perf start, stop;
+
+                perf_start(&start);
+                for (i = 0; i < TEST_LOOPS; i++) {
+                        openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH,
+                                            ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
+                                            cyphertext);
+                }
+
+                perf_stop(&stop);
+                printf("openssl_aes_gcm_enc" TEST_TYPE_STR ":\t");
+                perf_print(stop, start, (long long)TEST_LEN * i);
+        }
+        {
+                struct perf start, stop;
+
+                perf_start(&start);
+                for (i = 0; i < TEST_LOOPS; i++) {
+                        aesni_gcm128_dec(&gdata, plaintext, cyphertext, TEST_LEN, IV,
+                                         AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+                        // gcm_tag is the tag the decrypt just wrote; ossl_tag is the tag
+                        // OpenSSL produced for the same key, IV, AAD and plain text
+                        check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L check of tag (T)");
+                }
+
+                perf_stop(&stop);
+                printf(" aes_gcm_dec" TEST_TYPE_STR ":\t");
+                perf_print(stop, start, (long long)TEST_LEN * i);
+        }
+        {
+                struct perf start, stop;
+
+                perf_start(&start);
+                for (i = 0; i < TEST_LOOPS; i++) {
+                        openssl_aes_gcm_dec(key128, IV, iv_len, AAD, AAD_LENGTH,
+                                            ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
+                                            plaintext);
+                }
+
+                perf_stop(&stop);
+                printf("openssl_aes_gcm_dec" TEST_TYPE_STR ":\t");
+                perf_print(stop, start, (long long)TEST_LEN * i);
+        }
+
+        printf("\n");
+        {
+                struct perf start, stop;
+
+                perf_start(&start);
+                for (i = 0; i < TEST_LOOPS; i++) {
+                        aesni_gcm256_enc(&gdata256, cyphertext, plaintext, TEST_LEN, IV, AAD,
+                                         AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+                }
+
+                perf_stop(&stop);
+                printf(" aes_gcm256_enc" TEST_TYPE_STR ":\t");
+                perf_print(stop, start, (long long)TEST_LEN * i);
+        }
+
+        {
+                struct perf start, stop;
+
+                perf_start(&start);
+                for (i = 0; i < TEST_LOOPS; i++) {
+                        openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH,
+                                                ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
+                                                cyphertext);
+                }
+
+                perf_stop(&stop);
+                printf("openssl_aes_256_gcm_enc" TEST_TYPE_STR ":\t");
+                perf_print(stop, start, (long long)TEST_LEN * i);
+        }
+
+        {
+                struct perf start, stop;
+
+                perf_start(&start);
+                for (i = 0; i < TEST_LOOPS; i++) {
+                        aesni_gcm256_dec(&gdata256, plaintext, cyphertext, TEST_LEN, IV,
+                                         AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+                        // compare against the tag from the OpenSSL 256 encrypt loop above
+                        check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0,
+                                   "ISA-L check of 256 tag (T)");
+                }
+
+                perf_stop(&stop);
+                printf(" aes_gcm256_dec" TEST_TYPE_STR ":\t");
+                perf_print(stop, start, (long long)TEST_LEN * i);
+        }
+        {
+                struct perf start, stop;
+
+                perf_start(&start);
+                for (i = 0; i < TEST_LOOPS; i++) {
+                        openssl_aes_256_gcm_dec(key256, IV, iv_len, AAD, AAD_LENGTH,
+                                                ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
+                                                plaintext);
+                }
+
+                perf_stop(&stop);
+                printf("openssl_aes_256_gcm_dec" TEST_TYPE_STR ":\t");
+                perf_print(stop, start, (long long)TEST_LEN * i);
+        }
+}
+
+/*
+ * Allocate and randomize the test buffers, end the IV with GCM_IV_END_MARK,
+ * run the benchmark and release the buffers again.
+ *
+ * Returns -1 when an allocation fails and 0 otherwise. aes_gcm_perf() returns
+ * void, so mismatches it prints do not change the exit status.
+ */
+int main(void)
+{
+        uint8_t const IVend[] = GCM_IV_END_MARK;
+        int rc = -1;
+
+        plaintext = malloc(TEST_LEN);
+        gcm_plaintext = malloc(TEST_LEN);
+        cyphertext = malloc(TEST_LEN);
+        ossl_plaintext = malloc(TEST_LEN + 16);
+        ossl_cyphertext = malloc(TEST_LEN);
+        gcm_tag = malloc(MAX_TAG_LEN);
+        ossl_tag = malloc(MAX_TAG_LEN);
+        AAD = malloc(AAD_LENGTH);
+        IV = malloc(GCM_IV_LEN);
+        if ((NULL == plaintext) || (NULL == cyphertext) || (NULL == gcm_plaintext)
+            || (NULL == ossl_plaintext) || (NULL == ossl_cyphertext)
+            || (NULL == gcm_tag) || (NULL == ossl_tag) || (NULL == AAD) || (NULL == IV)) {
+                printf("malloc of testsize:0x%x failed\n", TEST_LEN);
+                goto out;
+        }
+
+        mk_rand_data(plaintext, TEST_LEN);
+        mk_rand_data(AAD, AAD_LENGTH);
+        mk_rand_data(IV, GCM_IV_LEN);
+        memcpy(&IV[GCM_IV_END_START], IVend, sizeof(IVend));
+        iv_len = GCM_IV_LEN - sizeof(IVend);        //end marker not part of IV length
+
+        aes_gcm_perf();
+        printf("AES gcm ISA-L vs OpenSSL performance\n");
+        rc = 0;
+
+out:
+        // free(NULL) is a no-op, so this also covers a partially failed allocation
+        free(plaintext);
+        free(gcm_plaintext);
+        free(cyphertext);
+        free(ossl_plaintext);
+        free(ossl_cyphertext);
+        free(gcm_tag);
+        free(ossl_tag);
+        free(AAD);
+        free(IV);
+        return rc;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c
new file mode 100644
index 00000000..7f0bbcba
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c
@@ -0,0 +1,71 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <aes_gcm.h>
+#include <aes_keyexp.h>
+
+void aes_keyexp_128_enc(uint8_t *, uint8_t *);
+void aesni_gcm128_precomp(struct gcm_data *my_ctx_data);
+void aesni_gcm256_precomp(struct gcm_data *my_ctx_data);
+
+/*
+ * Prepare gdata for AES-128 GCM with the given key: expand the key into
+ * gdata->expanded_keys (round keys for encrypting/decrypting), then let
+ * aesni_gcm128_precomp() prefill the sub hash key values used for the tag.
+ */
+void aesni_gcm128_pre(uint8_t * key, struct gcm_data *gdata)
+{
+        uint8_t *round_keys = (uint8_t *) gdata->expanded_keys;
+
+        aes_keyexp_128_enc(key, round_keys);
+        aesni_gcm128_precomp(gdata);
+}
+
+/*
+ * Prepare gdata for AES-256 GCM with the given key: expand the key into
+ * gdata->expanded_keys (round keys for encrypting/decrypting), then let
+ * aesni_gcm256_precomp() prefill the sub hash key values used for the tag.
+ */
+void aesni_gcm256_pre(uint8_t * key, struct gcm_data *gdata)
+{
+        // Receives the third argument's result from aes_keyexp_256() (presumably the
+        // decryption key schedule -- confirm in aes_keyexp.h); it is not used afterwards.
+        struct gcm_data scratch;
+
+        aes_keyexp_256(key, gdata->expanded_keys, scratch.expanded_keys);
+        aesni_gcm256_precomp(gdata);
+}
+
+struct slver { // version record; field order matches the initializers below
+ uint16_t snum; // serial number, e.g. 0x0287 for aesni_gcm128_pre
+ uint8_t ver; // version
+ uint8_t core; // core
+};
+
+// Version info
+struct slver aesni_gcm128_pre_slver_00000287; // NOTE(review): symbol name appears to spell core, ver, snum (00, 00, 0287) -- confirm
+struct slver aesni_gcm128_pre_slver = { 0x0287, 0x00, 0x00 }; // snum, ver, core
+
+struct slver aesni_gcm256_pre_slver_0000028f; // NOTE(review): same naming scheme with snum 028f -- confirm
+struct slver aesni_gcm256_pre_slver = { 0x028f, 0x00, 0x00 }; // snum, ver, core
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c
new file mode 100644
index 00000000..e4b5b92f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c
@@ -0,0 +1,1937 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <openssl/sha.h>
+#include "gcm_vectors.h"
+#include "ossl_helper.h"
+#include "types.h"
+
+//define GCM_VECTORS_VERBOSE
+//define GCM_VECTORS_EXTRA_VERBOSE
+// Build-time tunables; each can be overridden by defining it before this point
+// (e.g. on the compiler command line).
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+// Number of random vectors generated by each random test loop.
+#ifndef RANDOMS
+# define RANDOMS 200
+#endif
+// Upper bound (exclusive) for random plaintext / AAD lengths.
+// The value is parenthesized so that `rand() % TEST_LEN` means
+// `rand() % (32*1024)`; unparenthesized it parsed as `(rand() % 32) * 1024`
+// and only ever produced multiples of 1024.
+#ifndef TEST_LEN
+# define TEST_LEN (32*1024)
+#endif
+// Size of the staging buffer used by the efence stream test.
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+
+// Stream split positions and buffer offsets depend on the build flavour:
+//  - with NT_LD, NT_ST or NT_LDST defined, ALIGNMENT_MASK (~15) clears the low
+//    four bits of a split position (making it a multiple of 16) and
+//    MAX_UNALIGNED defaults to 1, so `rand() % MAX_UNALIGNED` is always 0;
+//  - otherwise the mask (~0) leaves positions untouched and MAX_UNALIGNED
+//    defaults to 16, so buffer offsets are drawn from 0..15.
+// OFFSET_BASE_VALUE is the offset the tests substitute when both the random
+// offset and the AAD length come out as zero.
+// NOTE(review): NT_* presumably select non-temporal load/store builds that
+// need 16-byte aligned data - confirm.
+#if defined(NT_LD) || defined(NT_ST) || defined(NT_LDST)
+# define ALIGNMENT_MASK (~15)
+# define OFFSET_BASE_VALUE 16
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (1)
+#endif
+#else
+# define ALIGNMENT_MASK (~0)
+# define OFFSET_BASE_VALUE 1
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (16)
+#endif
+#endif
+
+
+/*
+ * Prints `count` bytes of `table` to stdout as a brace-delimited hex listing
+ * headed by `title`, sixteen values per row.
+ *
+ * A row that is not completely filled is not terminated with a newline before
+ * the closing brace is printed.
+ */
+void dump_table(char *title, uint8_t * table, uint8_t count)
+{
+    static const char indent[] = " ";
+    int idx = 0;
+
+    printf("%s%s => {\n", indent, title);
+    while (idx < count) {
+        const int column = idx & 15;    // position within the current row
+
+        if (column == 0)
+            printf("%s%s", indent, indent);
+        printf("%2x, ", table[idx]);
+        if (column == 15)
+            printf("\n");
+        idx++;
+    }
+    printf("%s}\n", indent);
+}
+
+/*
+ * Debug helper: when GCM_VECTORS_EXTRA_VERBOSE is defined, prints the
+ * expanded key area (16 * 11 bytes) followed by every 16-byte shifted_hkey_*
+ * field of `gdata` through dump_table(). Without that macro the body is empty.
+ */
+void dump_gcm_data(struct gcm_data *gdata)
+{
+#ifdef GCM_VECTORS_EXTRA_VERBOSE
+    // Printed in this order, sixteen bytes each.
+    const struct {
+        char *label;
+        uint8_t *bytes;
+    } hkeys[] = {
+        {"shifted_hkey_1", gdata->shifted_hkey_1},
+        {"shifted_hkey_2", gdata->shifted_hkey_2},
+        {"shifted_hkey_3", gdata->shifted_hkey_3},
+        {"shifted_hkey_4", gdata->shifted_hkey_4},
+        {"shifted_hkey_5", gdata->shifted_hkey_5},
+        {"shifted_hkey_6", gdata->shifted_hkey_6},
+        {"shifted_hkey_7", gdata->shifted_hkey_7},
+        {"shifted_hkey_8", gdata->shifted_hkey_8},
+        {"shifted_hkey_1_k", gdata->shifted_hkey_1_k},
+        {"shifted_hkey_2_k", gdata->shifted_hkey_2_k},
+        {"shifted_hkey_3_k", gdata->shifted_hkey_3_k},
+        {"shifted_hkey_4_k", gdata->shifted_hkey_4_k},
+        {"shifted_hkey_5_k", gdata->shifted_hkey_5_k},
+        {"shifted_hkey_6_k", gdata->shifted_hkey_6_k},
+        {"shifted_hkey_7_k", gdata->shifted_hkey_7_k},
+        {"shifted_hkey_8_k", gdata->shifted_hkey_8_k},
+    };
+    unsigned int row;
+
+    printf("gcm_data {\n");
+    dump_table("expanded_keys", gdata->expanded_keys, (16 * 11));
+    for (row = 0; row < sizeof(hkeys) / sizeof(hkeys[0]); row++)
+        dump_table(hkeys[row].label, hkeys[row].bytes, 16);
+    printf("}\n");
+#endif // GCM_VECTORS_EXTRA_VERBOSE
+}
+
+/*
+ * Fills data[0 .. size-1] with pseudo-random bytes, one rand() call per byte
+ * (each result truncated to 8 bits). The sequence therefore depends on the
+ * current srand() state. A size of 0 writes nothing.
+ */
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+    uint32_t filled;    // same type as size: no signed/unsigned comparison
+
+    for (filled = 0; filled < size; filled++)
+        data[filled] = rand();
+}
+
+/*
+ * Compares `len` bytes of `test` against `expected`.
+ *
+ * On a mismatch it prints `data_name` plus the first differing byte pair and
+ * its offset (all in hex) to stdout.
+ *
+ * Returns 0 when the buffers are equal (always the case for len == 0) and
+ * 1 when they differ.
+ */
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+    uint64_t pos;
+
+    if (memcmp(test, expected, len) == 0)
+        return 0;
+
+    printf(" expected results don't match %s \t\t", data_name);
+    for (pos = 0; pos < len; pos++) {
+        if (test[pos] != expected[pos]) {
+            // uint64_t is not `unsigned long` on every ABI, so print through
+            // unsigned long long instead of relying on %lx.
+            printf(" '%x' != '%x' at %llx of %llx\n",
+                   test[pos], expected[pos],
+                   (unsigned long long)pos, (unsigned long long)len);
+            break;
+        }
+    }
+    return 1;
+}
+
+/*
+ * Runs one vector through the one-shot aesni_gcm128 API and cross-checks the
+ * results against OpenSSL.
+ *
+ *  1. aesni_gcm128_enc() output (vector->C, vector->T) is compared with the
+ *     ciphertext and tag produced by openssl_aes_gcm_enc().
+ *  2. aesni_gcm128_dec() must reproduce the plaintext and the OpenSSL tag,
+ *     both from the ISA-L ciphertext and from the OpenSSL ciphertext.
+ *  3. openssl_aes_gcm_dec() must accept the ISA-L ciphertext and tag.
+ *
+ * vector->C, vector->P and vector->T are overwritten along the way, and gdata
+ * is re-keyed from vector->K.
+ *
+ * Returns 0 when every check passed, non-zero on any mismatch or when a
+ * working buffer could not be allocated. All working buffers are released on
+ * every path.
+ */
+int check_vector(struct gcm_data *gdata, gcm_vector * vector)
+{
+    uint8_t *pt_test = NULL;
+    uint8_t *ct_test = NULL;
+    uint8_t *o_ct_test = NULL;
+    uint8_t *IV_c = NULL;
+    uint8_t *T_test = NULL;
+    uint8_t *o_T_test = NULL;
+    uint8_t const IVend[] = GCM_IV_END_MARK;
+    uint64_t IV_alloc_len = 0;
+    int result;
+    int OK = 1;    // stays "failed" until every buffer has been allocated
+
+#ifdef GCM_VECTORS_VERBOSE
+    printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+           (int)vector->Klen,
+           (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+    printf(".");
+#endif
+    // Allocate space for the calculated ciphertext
+    if (vector->Plen != 0) {
+        pt_test = malloc(vector->Plen);
+        ct_test = malloc(vector->Plen);
+        o_ct_test = malloc(vector->Plen);
+        if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+            fprintf(stderr, "Can't allocate ciphertext memory\n");
+            goto cleanup;
+        }
+    }
+    // Allocate space for the IV followed by the end marker
+    IV_alloc_len = vector->IVlen + sizeof(IVend);
+    IV_c = malloc(IV_alloc_len);
+    if (IV_c == NULL) {
+        fprintf(stderr, "Can't allocate ciphertext memory\n");
+        goto cleanup;
+    }
+    //Add end marker to the IV data for ISA-L
+    memcpy(IV_c, vector->IV, vector->IVlen);
+    memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+    T_test = malloc(vector->Tlen);
+    o_T_test = malloc(vector->Tlen);
+    if ((T_test == NULL) || (o_T_test == NULL)) {
+        fprintf(stderr, "Can't allocate tag memory\n");
+        goto cleanup;
+    }
+    OK = 0;
+
+    // This is only required once for a given key
+    aesni_gcm128_pre(vector->K, gdata);
+
+    ////
+    // ISA-l Encrypt
+    ////
+    aesni_gcm128_enc(gdata, vector->C, vector->P, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    openssl_aes_gcm_enc(vector->K, vector->IV,
+                        vector->IVlen, vector->A, vector->Alen, o_T_test,
+                        vector->Tlen, vector->P, vector->Plen, o_ct_test);
+    OK |=
+        check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+    // Keep copies of the reference data, then wipe the outputs so the decrypt
+    // step has to regenerate them.
+    memcpy(ct_test, vector->C, vector->Plen);
+    memcpy(pt_test, vector->P, vector->Plen);
+    memset(vector->P, 0, vector->Plen);
+    memcpy(T_test, vector->T, vector->Tlen);
+    memset(vector->T, 0, vector->Tlen);
+
+    ////
+    // ISA-l Decrypt
+    ////
+    aesni_gcm128_dec(gdata, vector->P, vector->C, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+    OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+    memset(vector->P, 0, vector->Plen);
+    // Same decrypt again, this time fed with the OpenSSL ciphertext
+    aesni_gcm128_dec(gdata, vector->P, o_ct_test, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+    result =
+        openssl_aes_gcm_dec(vector->K, vector->IV,
+                            vector->IVlen, vector->A, vector->Alen,
+                            vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+    if (-1 == result)
+        printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+    OK |= (-1 == result);
+
+cleanup:
+    free(T_test);
+    free(o_T_test);
+    free(IV_c);
+    free(pt_test);
+    free(ct_test);
+    free(o_ct_test);
+
+    return OK;
+}
+
+/*
+ * Runs one vector through the streaming aesni_gcm128 API
+ * (init / update / finalize) with randomly placed chunk boundaries and
+ * cross-checks the results against OpenSSL.
+ *
+ * Encrypt: split positions advance by `(rand() % test_len / 32) &
+ * ALIGNMENT_MASK` bytes per step; every chunk is copied into its own heap
+ * buffer before being passed to aesni_gcm128_enc_update().
+ * Decrypt: the position advances roughly one byte at a time and a chunk is
+ * cut with probability 1 / (test_len / 64) at each position.
+ *
+ * Between chunks a small buffer is occasionally filled with random bytes and
+ * hashed with SHA1(); the digest is not used.
+ * NOTE(review): presumably this is there to disturb CPU state between update
+ * calls - confirm.
+ *
+ * test_len must be at least 64 because `test_len / 64` is used as a divisor.
+ * vector->C, vector->P and vector->T are overwritten, and gdata is re-keyed
+ * from vector->K.
+ *
+ * Returns 0 when every check passed, non-zero on any mismatch or allocation
+ * failure. All working buffers are released on every path.
+ */
+int check_strm_vector(struct gcm_data *gdata, gcm_vector * vector, int test_len)
+{
+    uint8_t *pt_test = NULL;
+    uint8_t *ct_test = NULL;
+    uint8_t *o_ct_test = NULL;
+    uint8_t *IV_c = NULL;
+    uint8_t *T_test = NULL;
+    uint8_t *o_T_test = NULL;
+    uint8_t *stream = NULL;
+    uint8_t *rand_data = NULL;
+    uint8_t const IVend[] = GCM_IV_END_MARK;
+    uint64_t IV_alloc_len = 0;
+    uint64_t length;
+    uint32_t last_break;
+    int result;
+    int OK = 1;    // stays "failed" until every buffer has been allocated
+    int i;
+
+    rand_data = malloc(100);
+    if (rand_data == NULL) {
+        fprintf(stderr, "Can't allocate scratch memory\n");
+        goto cleanup;
+    }
+
+#ifdef GCM_VECTORS_VERBOSE
+    printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+           (int)vector->Klen,
+           (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+    printf(".");
+#endif
+    // Allocate space for the calculated ciphertext
+    if (vector->Plen != 0) {
+        pt_test = malloc(vector->Plen);
+        ct_test = malloc(vector->Plen);
+        o_ct_test = malloc(vector->Plen);
+        if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+            fprintf(stderr, "Can't allocate ciphertext memory\n");
+            goto cleanup;
+        }
+    }
+    // Allocate space for the IV followed by the end marker
+    IV_alloc_len = vector->IVlen + sizeof(IVend);
+    IV_c = malloc(IV_alloc_len);
+    if (IV_c == NULL) {
+        fprintf(stderr, "Can't allocate ciphertext memory\n");
+        goto cleanup;
+    }
+    //Add end marker to the IV data for ISA-L
+    memcpy(IV_c, vector->IV, vector->IVlen);
+    memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+    T_test = malloc(vector->Tlen);
+    o_T_test = malloc(vector->Tlen);
+    if ((T_test == NULL) || (o_T_test == NULL)) {
+        fprintf(stderr, "Can't allocate tag memory\n");
+        goto cleanup;
+    }
+    OK = 0;
+
+    // This is only required once for a given key
+    aesni_gcm128_pre(vector->K, gdata);
+
+    ////
+    // ISA-l Encrypt
+    ////
+    aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+
+    last_break = 0;
+    i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+    while (i < (vector->Plen)) {
+        if (i - last_break != 0) {
+            stream = malloc(i - last_break);
+            if (stream == NULL) {
+                fprintf(stderr, "Can't allocate stream memory\n");
+                OK = 1;
+                goto cleanup;
+            }
+            memcpy(stream, vector->P + last_break, i - last_break);
+        }
+        aesni_gcm128_enc_update(gdata, vector->C + last_break, stream,
+                                i - last_break);
+        free(stream);
+        stream = NULL;    // an empty chunk must not reuse a freed pointer
+
+        if (rand() % 1024 == 0) {
+            length = rand() % 100;
+            mk_rand_data(rand_data, length);
+            SHA1(rand_data, length, rand_data);
+        }
+        last_break = i;
+        // Advance the split point. Assigning a fresh random position here
+        // could move it behind last_break and make the unsigned chunk
+        // length `i - last_break` wrap around.
+        i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+    }
+    aesni_gcm128_enc_update(gdata, vector->C + last_break, vector->P + last_break,
+                            vector->Plen - last_break);
+    if (gdata->in_length != vector->Plen)
+        printf("%lu, %lu\n", gdata->in_length, vector->Plen);
+    aesni_gcm128_enc_finalize(gdata, vector->T, vector->Tlen);
+    openssl_aes_gcm_enc(vector->K, vector->IV,
+                        vector->IVlen, vector->A, vector->Alen, o_T_test,
+                        vector->Tlen, vector->P, vector->Plen, o_ct_test);
+    OK |=
+        check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+    // Keep copies of the reference data, then wipe the outputs so the decrypt
+    // step has to regenerate them.
+    memcpy(ct_test, vector->C, vector->Plen);
+    memcpy(pt_test, vector->P, vector->Plen);
+    memset(vector->P, 0, vector->Plen);
+    memcpy(T_test, vector->T, vector->Tlen);
+    memset(vector->T, 0, vector->Tlen);
+
+    ////
+    // ISA-l Decrypt
+    ////
+
+    last_break = 0;
+    i = 0;
+    aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+    while (i < (vector->Plen)) {
+        if (rand() % (test_len / 64) == 0) {
+            if (i - last_break != 0) {
+                stream = malloc(i - last_break);
+                if (stream == NULL) {
+                    fprintf(stderr, "Can't allocate stream memory\n");
+                    OK = 1;
+                    goto cleanup;
+                }
+                memcpy(stream, vector->C + last_break, i - last_break);
+            }
+            aesni_gcm128_dec_update(gdata, vector->P + last_break, stream,
+                                    i - last_break);
+            free(stream);
+            stream = NULL;
+
+            if (rand() % 1024 == 0) {
+                length = rand() % 100;
+
+                mk_rand_data(rand_data, length);
+                SHA1(rand_data, length, rand_data);
+            }
+
+            last_break = i;
+
+        }
+        // Occasionally stay on the same position to get back-to-back breaks
+        if (rand() % 1024 != 0)
+            i++;
+
+    }
+    aesni_gcm128_dec_update(gdata, vector->P + last_break, vector->C + last_break,
+                            vector->Plen - last_break);
+    aesni_gcm128_dec_finalize(gdata, vector->T, vector->Tlen);
+
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+    OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+    memset(vector->P, 0, vector->Plen);
+    // One-shot decrypt of the OpenSSL ciphertext
+    aesni_gcm128_dec(gdata, vector->P, o_ct_test, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+    result =
+        openssl_aes_gcm_dec(vector->K, vector->IV,
+                            vector->IVlen, vector->A, vector->Alen,
+                            vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+    if (-1 == result)
+        printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+    OK |= (-1 == result);
+
+cleanup:
+    free(T_test);
+    free(o_T_test);
+    free(IV_c);
+    free(pt_test);
+    free(ct_test);
+    free(o_ct_test);
+    free(rand_data);
+
+    return OK;
+}
+
+/*
+ * Runs one vector through the streaming aesni_gcm128 API with evenly spaced
+ * chunk boundaries and cross-checks the results against OpenSSL.
+ *
+ * The first chunk ends at `length`; every following boundary lies
+ * `(length - start) / breaks` bytes further on, and the remainder up to
+ * vector->Plen goes into a final update call. Each chunk is copied into its
+ * own heap buffer before being passed to the update function.
+ *
+ * `breaks` must be non-zero and the step `(length - start) / breaks` must be
+ * positive whenever length < vector->Plen, otherwise the loop divides by
+ * zero or never advances.
+ *
+ * vector->C, vector->P and vector->T are overwritten, and gdata is re-keyed
+ * from vector->K.
+ *
+ * Returns 0 when every check passed, non-zero on any mismatch or allocation
+ * failure. All working buffers are released on every path.
+ */
+int check_strm_vector2(struct gcm_data *gdata, gcm_vector * vector, int length, int start,
+                       int breaks)
+{
+    uint8_t *pt_test = NULL;
+    uint8_t *ct_test = NULL;
+    uint8_t *o_ct_test = NULL;
+    uint8_t *IV_c = NULL;
+    uint8_t *T_test = NULL;
+    uint8_t *o_T_test = NULL;
+    uint8_t *stream = NULL;
+    uint8_t const IVend[] = GCM_IV_END_MARK;
+    uint64_t IV_alloc_len = 0;
+    int result;
+    int OK = 1;    // stays "failed" until every buffer has been allocated
+    uint32_t last_break = 0;
+    int i = length;
+
+#ifdef GCM_VECTORS_VERBOSE
+    printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+           (int)vector->Klen,
+           (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+    printf(".");
+#endif
+    // Allocate space for the calculated ciphertext
+    if (vector->Plen != 0) {
+        pt_test = malloc(vector->Plen);
+        ct_test = malloc(vector->Plen);
+        o_ct_test = malloc(vector->Plen);
+        if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+            fprintf(stderr, "Can't allocate ciphertext memory\n");
+            goto cleanup;
+        }
+    }
+    // Allocate space for the IV followed by the end marker
+    IV_alloc_len = vector->IVlen + sizeof(IVend);
+    IV_c = malloc(IV_alloc_len);
+    if (IV_c == NULL) {
+        fprintf(stderr, "Can't allocate ciphertext memory\n");
+        goto cleanup;
+    }
+    //Add end marker to the IV data for ISA-L
+    memcpy(IV_c, vector->IV, vector->IVlen);
+    memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+    T_test = malloc(vector->Tlen);
+    o_T_test = malloc(vector->Tlen);
+    if ((T_test == NULL) || (o_T_test == NULL)) {
+        fprintf(stderr, "Can't allocate tag memory\n");
+        goto cleanup;
+    }
+    OK = 0;
+
+    // This is only required once for a given key
+    aesni_gcm128_pre(vector->K, gdata);
+
+    ////
+    // ISA-l Encrypt
+    ////
+    // A one-shot encrypt runs first; the streaming pass below overwrites its
+    // output in vector->C / vector->T.
+    aesni_gcm128_enc(gdata, vector->C, vector->P, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+    while (i < (vector->Plen)) {
+        if (i - last_break != 0) {
+            stream = malloc(i - last_break);
+            if (stream == NULL) {
+                fprintf(stderr, "Can't allocate stream memory\n");
+                OK = 1;
+                goto cleanup;
+            }
+            memcpy(stream, vector->P + last_break, i - last_break);
+        }
+        aesni_gcm128_enc_update(gdata, vector->C + last_break, stream, i - last_break);
+        free(stream);
+        stream = NULL;    // an empty chunk must not reuse a freed pointer
+        last_break = i;
+        i = i + (length - start) / breaks;
+
+    }
+    aesni_gcm128_enc_update(gdata, vector->C + last_break, vector->P + last_break,
+                            vector->Plen - last_break);
+    aesni_gcm128_enc_finalize(gdata, vector->T, vector->Tlen);
+    openssl_aes_gcm_enc(vector->K, vector->IV,
+                        vector->IVlen, vector->A, vector->Alen, o_T_test,
+                        vector->Tlen, vector->P, vector->Plen, o_ct_test);
+
+    OK |=
+        check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+    // Keep copies of the reference data, then wipe the outputs so the decrypt
+    // step has to regenerate them.
+    memcpy(ct_test, vector->C, vector->Plen);
+    memcpy(pt_test, vector->P, vector->Plen);
+    memset(vector->P, 0, vector->Plen);
+    memcpy(T_test, vector->T, vector->Tlen);
+    memset(vector->T, 0, vector->Tlen);
+
+    ////
+    // ISA-l Decrypt
+    ////
+
+    last_break = 0;
+    i = length;
+    aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+    while (i < (vector->Plen)) {
+        if (i - last_break != 0) {
+            stream = malloc(i - last_break);
+            if (stream == NULL) {
+                fprintf(stderr, "Can't allocate stream memory\n");
+                OK = 1;
+                goto cleanup;
+            }
+            memcpy(stream, vector->C + last_break, i - last_break);
+        }
+        aesni_gcm128_dec_update(gdata, vector->P + last_break, stream, i - last_break);
+        free(stream);
+        stream = NULL;
+        last_break = i;
+        i = i + (length - start) / breaks;
+
+    }
+
+    aesni_gcm128_dec_update(gdata, vector->P + last_break, vector->C + last_break,
+                            vector->Plen - last_break);
+    aesni_gcm128_dec_finalize(gdata, vector->T, vector->Tlen);
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+    OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+    memset(vector->P, 0, vector->Plen);
+    // One-shot decrypt of the OpenSSL ciphertext
+    aesni_gcm128_dec(gdata, vector->P, o_ct_test, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+    result =
+        openssl_aes_gcm_dec(vector->K, vector->IV,
+                            vector->IVlen, vector->A, vector->Alen,
+                            vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+    if (-1 == result)
+        printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+    OK |= (-1 == result);
+
+cleanup:
+    // The original released none of these buffers.
+    free(T_test);
+    free(o_T_test);
+    free(IV_c);
+    free(pt_test);
+    free(ct_test);
+    free(o_ct_test);
+
+    return OK;
+}
+
+/*
+ * Streaming aesni_gcm128 test in which every chunk is staged at the very end
+ * of a freshly allocated PAGE_LEN buffer, then cross-checked against OpenSSL.
+ *
+ * A chunk is cut with probability 1/2000 per position, or as soon as more
+ * than PAGE_LEN / 2 bytes have accumulated. The split position is first
+ * masked with ALIGNMENT_MASK, and the chunk is copied so that its last byte
+ * is the last byte of the staging buffer.
+ * NOTE(review): going by the function name, the tail placement is presumably
+ * meant to let a guard-page allocator such as Electric Fence catch reads
+ * past the input - confirm.
+ *
+ * Between chunks a small buffer is occasionally filled with random bytes and
+ * hashed with SHA1(); the digest is not used.
+ *
+ * vector->C, vector->P and vector->T are overwritten, and gdata is re-keyed
+ * from vector->K.
+ *
+ * Returns 0 when every check passed, non-zero on any mismatch or allocation
+ * failure. All working buffers are released on every path.
+ */
+int check_strm_vector_efence(struct gcm_data *gdata, gcm_vector * vector)
+{
+    uint8_t *pt_test = NULL;
+    uint8_t *ct_test = NULL;
+    uint8_t *o_ct_test = NULL;
+    uint8_t *IV_c = NULL;
+    uint8_t *T_test = NULL;
+    uint8_t *o_T_test = NULL;
+    uint8_t *stream = NULL;
+    uint8_t *rand_data = NULL;
+    uint8_t const IVend[] = GCM_IV_END_MARK;
+    uint64_t IV_alloc_len = 0;
+    uint64_t length;
+    uint32_t last_break = 0;
+    int result;
+    int OK = 1;    // stays "failed" until every buffer has been allocated
+    int i = 1;
+
+    rand_data = malloc(100);
+    if (rand_data == NULL) {
+        fprintf(stderr, "Can't allocate scratch memory\n");
+        goto cleanup;
+    }
+
+#ifdef GCM_VECTORS_VERBOSE
+    printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+           (int)vector->Klen,
+           (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+    printf(".");
+#endif
+    // Allocate space for the calculated ciphertext
+    if (vector->Plen != 0) {
+        pt_test = malloc(vector->Plen);
+        ct_test = malloc(vector->Plen);
+        o_ct_test = malloc(vector->Plen);
+        if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+            fprintf(stderr, "Can't allocate ciphertext memory\n");
+            goto cleanup;
+        }
+    }
+    // Allocate space for the IV followed by the end marker
+    IV_alloc_len = vector->IVlen + sizeof(IVend);
+    IV_c = malloc(IV_alloc_len);
+    if (IV_c == NULL) {
+        fprintf(stderr, "Can't allocate ciphertext memory\n");
+        goto cleanup;
+    }
+    //Add end marker to the IV data for ISA-L
+    memcpy(IV_c, vector->IV, vector->IVlen);
+    memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+    T_test = malloc(vector->Tlen);
+    o_T_test = malloc(vector->Tlen);
+    if ((T_test == NULL) || (o_T_test == NULL)) {
+        fprintf(stderr, "Can't allocate tag memory\n");
+        goto cleanup;
+    }
+    OK = 0;
+
+    // This is only required once for a given key
+    aesni_gcm128_pre(vector->K, gdata);
+
+    ////
+    // ISA-l Encrypt
+    ////
+    aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+    while (i < vector->Plen) {
+        if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+            stream = malloc(PAGE_LEN);
+            if (stream == NULL) {
+                fprintf(stderr, "Can't allocate stream memory\n");
+                OK = 1;
+                goto cleanup;
+            }
+            i = i & ALIGNMENT_MASK;
+            // Stage the chunk so that it ends at the end of the buffer
+            memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break,
+                   i - last_break);
+            aesni_gcm128_enc_update(gdata, vector->C + last_break,
+                                    stream + PAGE_LEN - (i - last_break),
+                                    i - last_break);
+            free(stream);
+            stream = NULL;
+
+            if (rand() % 1024 == 0) {
+                length = rand() % 100;
+                mk_rand_data(rand_data, length);
+                SHA1(rand_data, length, rand_data);
+            }
+            last_break = i;
+        }
+        // Occasionally stay on the same position to get back-to-back breaks
+        if (rand() % 1024 != 0)
+            i++;
+
+    }
+    aesni_gcm128_enc_update(gdata, vector->C + last_break, vector->P + last_break,
+                            vector->Plen - last_break);
+    aesni_gcm128_enc_finalize(gdata, vector->T, vector->Tlen);
+    openssl_aes_gcm_enc(vector->K, vector->IV,
+                        vector->IVlen, vector->A, vector->Alen, o_T_test,
+                        vector->Tlen, vector->P, vector->Plen, o_ct_test);
+    OK |=
+        check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+    // Keep copies of the reference data, then wipe the outputs so the decrypt
+    // step has to regenerate them.
+    memcpy(ct_test, vector->C, vector->Plen);
+    memcpy(pt_test, vector->P, vector->Plen);
+    memset(vector->P, 0, vector->Plen);
+    memcpy(T_test, vector->T, vector->Tlen);
+    memset(vector->T, 0, vector->Tlen);
+
+    ////
+    // ISA-l Decrypt
+    ////
+
+    last_break = 0;
+    i = 0;
+    aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+    while (i < vector->Plen) {
+        if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+            stream = malloc(PAGE_LEN);
+            if (stream == NULL) {
+                fprintf(stderr, "Can't allocate stream memory\n");
+                OK = 1;
+                goto cleanup;
+            }
+            i = i & ALIGNMENT_MASK;
+            memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break,
+                   i - last_break);
+            aesni_gcm128_dec_update(gdata, vector->P + last_break,
+                                    stream + PAGE_LEN - (i - last_break),
+                                    i - last_break);
+            free(stream);
+            stream = NULL;
+
+            if (rand() % 1024 == 0) {
+                length = rand() % 100;
+
+                mk_rand_data(rand_data, length);
+                SHA1(rand_data, length, rand_data);
+            }
+
+            last_break = i;
+
+        }
+        if (rand() % 1024 != 0)
+            i++;
+
+    }
+    aesni_gcm128_dec_update(gdata, vector->P + last_break, vector->C + last_break,
+                            vector->Plen - last_break);
+    aesni_gcm128_dec_finalize(gdata, vector->T, vector->Tlen);
+
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+    OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+    memset(vector->P, 0, vector->Plen);
+    // One-shot decrypt of the OpenSSL ciphertext
+    aesni_gcm128_dec(gdata, vector->P, o_ct_test, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+    result =
+        openssl_aes_gcm_dec(vector->K, vector->IV,
+                            vector->IVlen, vector->A, vector->Alen,
+                            vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+    if (-1 == result)
+        printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+    OK |= (-1 == result);
+
+cleanup:
+    free(T_test);
+    free(o_T_test);
+    free(IV_c);
+    free(pt_test);
+    free(ct_test);
+    free(o_ct_test);
+    free(rand_data);
+
+    return OK;
+}
+
+/*
+ * Runs one vector through the one-shot aesni_gcm256 API and cross-checks the
+ * results against OpenSSL.
+ *
+ *  1. aesni_gcm256_enc() output (vector->C, vector->T) is compared with the
+ *     ciphertext and tag produced by openssl_aes_256_gcm_enc().
+ *  2. aesni_gcm256_dec() must reproduce the plaintext, the ISA-L encrypt tag
+ *     and the OpenSSL tag; the plaintext is checked both for the ISA-L and
+ *     for the OpenSSL ciphertext.
+ *  3. openssl_aes_256_gcm_dec() must accept the ISA-L ciphertext and tag.
+ *
+ * vector->C, vector->P and vector->T are overwritten along the way, and gdata
+ * is re-keyed from vector->K.
+ *
+ * Returns 0 when every check passed, non-zero on any mismatch or when a
+ * working buffer could not be allocated. All working buffers are released on
+ * every path.
+ */
+int check_256_vector(struct gcm_data *gdata, gcm_vector * vector)
+{
+    uint8_t *pt_test = NULL;
+    uint8_t *ct_test = NULL;
+    uint8_t *o_ct_test = NULL;
+    uint8_t *IV_c = NULL;
+    uint8_t *T_test = NULL;
+    uint8_t *o_T_test = NULL;
+    uint8_t const IVend[] = GCM_IV_END_MARK;
+    uint64_t IV_alloc_len = 0;
+    int result;
+    int OK = 1;    // stays "failed" until every buffer has been allocated
+
+#ifdef GCM_VECTORS_VERBOSE
+    printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+           (int)vector->Klen,
+           (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+    printf(".");
+#endif
+    // Allocate space for the calculated ciphertext
+    if (vector->Plen != 0) {
+        pt_test = malloc(vector->Plen);
+        ct_test = malloc(vector->Plen);
+        o_ct_test = malloc(vector->Plen);
+        if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+            fprintf(stderr, "Can't allocate ciphertext memory\n");
+            goto cleanup;
+        }
+    }
+    // Allocate space for the IV followed by the end marker
+    IV_alloc_len = vector->IVlen + sizeof(IVend);
+    IV_c = malloc(IV_alloc_len);
+    if (IV_c == NULL) {
+        fprintf(stderr, "Can't allocate ciphertext memory\n");
+        goto cleanup;
+    }
+    //Add end marker to the IV data for ISA-L
+    memcpy(IV_c, vector->IV, vector->IVlen);
+    memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+    T_test = malloc(vector->Tlen);
+    o_T_test = malloc(vector->Tlen);
+    if ((T_test == NULL) || (o_T_test == NULL)) {
+        fprintf(stderr, "Can't allocate tag memory\n");
+        goto cleanup;
+    }
+    OK = 0;
+
+    // This is only required once for a given key
+    aesni_gcm256_pre(vector->K, gdata);
+
+    ////
+    // ISA-l Encrypt
+    ////
+    aesni_gcm256_enc(gdata, vector->C, vector->P, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    openssl_aes_256_gcm_enc(vector->K, vector->IV,
+                            vector->IVlen, vector->A, vector->Alen, o_T_test,
+                            vector->Tlen, vector->P, vector->Plen, o_ct_test);
+    OK |=
+        check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+    // Keep copies of the reference data, then wipe the outputs so the decrypt
+    // step has to regenerate them.
+    memcpy(ct_test, vector->C, vector->Plen);
+    memcpy(pt_test, vector->P, vector->Plen);
+    memset(vector->P, 0, vector->Plen);
+    memcpy(T_test, vector->T, vector->Tlen);
+    memset(vector->T, 0, vector->Tlen);
+
+    ////
+    // ISA-l Decrypt
+    ////
+    aesni_gcm256_dec(gdata, vector->P, vector->C, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+    OK |=
+        check_data(pt_test, vector->P, vector->Plen,
+                   "ISA-L decrypted ISA-L plain text (P)");
+    memset(vector->P, 0, vector->Plen);
+    // Same decrypt again, this time fed with the OpenSSL ciphertext
+    aesni_gcm256_dec(gdata, vector->P, o_ct_test, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    OK |=
+        check_data(pt_test, vector->P, vector->Plen,
+                   "ISA-L decrypted OpenSSL plain text (P)");
+    result =
+        openssl_aes_256_gcm_dec(vector->K, vector->IV,
+                                vector->IVlen, vector->A, vector->Alen,
+                                vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+    if (-1 == result)
+        printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+    OK |= (-1 == result);
+
+cleanup:
+    free(T_test);
+    free(o_T_test);
+    free(IV_c);
+    free(pt_test);
+    free(ct_test);
+    free(o_ct_test);
+
+    return OK;
+}
+
+/*
+ * Runs one vector through the streaming aesni_gcm256 API
+ * (init / update / finalize) with randomly placed chunk boundaries and
+ * cross-checks the results against OpenSSL.
+ *
+ * For both directions the split position starts at, and then advances by,
+ * `(rand() % test_len / 32) & ALIGNMENT_MASK` bytes; every chunk is copied
+ * into its own heap buffer before being passed to the update function.
+ *
+ * Between chunks a small buffer is occasionally filled with random bytes and
+ * hashed with SHA1(); the digest is not used.
+ *
+ * test_len must be non-zero (it is a modulo operand) and should be at least
+ * 32, otherwise the step is always 0 and the loops cannot advance.
+ * vector->C, vector->P and vector->T are overwritten, and gdata is re-keyed
+ * from vector->K.
+ *
+ * Returns 0 when every check passed, non-zero on any mismatch or allocation
+ * failure. All working buffers are released on every path.
+ */
+int check_256_strm_vector(struct gcm_data *gdata, gcm_vector * vector, int test_len)
+{
+    uint8_t *pt_test = NULL;
+    uint8_t *ct_test = NULL;
+    uint8_t *o_ct_test = NULL;
+    uint8_t *IV_c = NULL;
+    uint8_t *T_test = NULL;
+    uint8_t *o_T_test = NULL;
+    uint8_t *stream = NULL;
+    uint8_t *rand_data = NULL;
+    uint8_t const IVend[] = GCM_IV_END_MARK;
+    uint64_t IV_alloc_len = 0;
+    uint64_t length;
+    uint32_t last_break;
+    int result;
+    int OK = 1;    // stays "failed" until every buffer has been allocated
+    int i;
+
+    rand_data = malloc(100);
+    if (rand_data == NULL) {
+        fprintf(stderr, "Can't allocate scratch memory\n");
+        goto cleanup;
+    }
+
+#ifdef GCM_VECTORS_VERBOSE
+    printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+           (int)vector->Klen,
+           (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+    printf(".");
+#endif
+    // Allocate space for the calculated ciphertext
+    if (vector->Plen != 0) {
+        pt_test = malloc(vector->Plen);
+        ct_test = malloc(vector->Plen);
+        o_ct_test = malloc(vector->Plen);
+        if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+            fprintf(stderr, "Can't allocate ciphertext memory\n");
+            goto cleanup;
+        }
+    }
+    // Allocate space for the IV followed by the end marker
+    IV_alloc_len = vector->IVlen + sizeof(IVend);
+    IV_c = malloc(IV_alloc_len);
+    if (IV_c == NULL) {
+        fprintf(stderr, "Can't allocate ciphertext memory\n");
+        goto cleanup;
+    }
+    //Add end marker to the IV data for ISA-L
+    memcpy(IV_c, vector->IV, vector->IVlen);
+    memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+    T_test = malloc(vector->Tlen);
+    o_T_test = malloc(vector->Tlen);
+    if ((T_test == NULL) || (o_T_test == NULL)) {
+        fprintf(stderr, "Can't allocate tag memory\n");
+        goto cleanup;
+    }
+    OK = 0;
+
+    // This is only required once for a given key
+    aesni_gcm256_pre(vector->K, gdata);
+
+    ////
+    // ISA-l Encrypt
+    ////
+    aesni_gcm256_init(gdata, IV_c, vector->A, vector->Alen);
+
+    last_break = 0;
+    i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+    while (i < (vector->Plen)) {
+        if (i - last_break != 0) {
+            stream = malloc(i - last_break);
+            if (stream == NULL) {
+                fprintf(stderr, "Can't allocate stream memory\n");
+                OK = 1;
+                goto cleanup;
+            }
+            memcpy(stream, vector->P + last_break, i - last_break);
+        }
+
+        aesni_gcm256_enc_update(gdata, vector->C + last_break, stream,
+                                i - last_break);
+        free(stream);
+        stream = NULL;    // an empty chunk must not reuse a freed pointer
+
+        if (rand() % 1024 == 0) {
+            length = rand() % 100;
+            mk_rand_data(rand_data, length);
+            SHA1(rand_data, length, rand_data);
+        }
+        last_break = i;
+        i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+    }
+    aesni_gcm256_enc_update(gdata, vector->C + last_break, vector->P + last_break,
+                            vector->Plen - last_break);
+    if (gdata->in_length != vector->Plen)
+        printf("%lu, %lu\n", gdata->in_length, vector->Plen);
+    aesni_gcm256_enc_finalize(gdata, vector->T, vector->Tlen);
+
+    openssl_aes_256_gcm_enc(vector->K, vector->IV,
+                            vector->IVlen, vector->A, vector->Alen, o_T_test,
+                            vector->Tlen, vector->P, vector->Plen, o_ct_test);
+    OK |=
+        check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+    // Keep copies of the reference data, then wipe the outputs so the decrypt
+    // step has to regenerate them.
+    memcpy(ct_test, vector->C, vector->Plen);
+    memcpy(pt_test, vector->P, vector->Plen);
+    memset(vector->P, 0, vector->Plen);
+    memcpy(T_test, vector->T, vector->Tlen);
+    memset(vector->T, 0, vector->Tlen);
+
+    ////
+    // ISA-l Decrypt
+    ////
+
+    last_break = 0;
+    // Start from a fresh position. Carrying over the encrypt loop's final
+    // value (which is already >= Plen) would skip the streaming decrypt loop.
+    i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+    aesni_gcm256_init(gdata, IV_c, vector->A, vector->Alen);
+    while (i < (vector->Plen)) {
+        if (i - last_break != 0) {
+            stream = malloc(i - last_break);
+            if (stream == NULL) {
+                fprintf(stderr, "Can't allocate stream memory\n");
+                OK = 1;
+                goto cleanup;
+            }
+            memcpy(stream, vector->C + last_break, i - last_break);
+        }
+
+        aesni_gcm256_dec_update(gdata, vector->P + last_break, stream,
+                                i - last_break);
+        free(stream);
+        stream = NULL;
+
+        if (rand() % 1024 == 0) {
+            length = rand() % 100;
+
+            mk_rand_data(rand_data, length);
+            SHA1(rand_data, length, rand_data);
+        }
+
+        last_break = i;
+        i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+    }
+    aesni_gcm256_dec_update(gdata, vector->P + last_break, vector->C + last_break,
+                            vector->Plen - last_break);
+    aesni_gcm256_dec_finalize(gdata, vector->T, vector->Tlen);
+
+    OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+    OK |=
+        check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+    OK |=
+        check_data(pt_test, vector->P, vector->Plen,
+                   "ISA-L decrypted ISA-L plain text (P)");
+    memset(vector->P, 0, vector->Plen);
+    // One-shot decrypt of the OpenSSL ciphertext
+    aesni_gcm256_dec(gdata, vector->P, o_ct_test, vector->Plen,
+                     IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+    OK |=
+        check_data(pt_test, vector->P, vector->Plen,
+                   "ISA-L decrypted OpenSSL plain text (P)");
+    result =
+        openssl_aes_256_gcm_dec(vector->K, vector->IV,
+                                vector->IVlen, vector->A, vector->Alen,
+                                vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+    if (-1 == result)
+        printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+    OK |= (-1 == result);
+
+cleanup:
+    free(T_test);
+    free(o_T_test);
+    free(IV_c);
+    free(pt_test);
+    free(ct_test);
+    free(o_ct_test);
+    free(rand_data);    // was never released before
+
+    return OK;
+}
+
+/*
+ * Generates RANDOMS random vectors with GCM_128_KEY_LEN keys and runs each of
+ * them through check_strm_vector_efence() once per tag length (8, 12, ... up
+ * to MAX_TAG_LEN).
+ *
+ * Plaintext and AAD lengths are drawn with `rand() % TEST_LEN`. Every buffer
+ * is allocated `offset` bytes larger than needed and used starting at
+ * `offset`, with offset drawn from `rand() % MAX_UNALIGNED`, so the data
+ * handed to the library can be misaligned.
+ *
+ * Returns 0 when all vectors passed, 1 on the first failed check or failed
+ * allocation. The context and all vector buffers are released on every path.
+ */
+int test_gcm_strm_efence(void)
+{
+    gcm_vector test;
+    int tag_len = 8;
+    int t = 0;
+    int status = 0;
+    struct gcm_data *gdata = NULL;
+
+    gdata = malloc(sizeof(struct gcm_data));
+    if (NULL == gdata)
+        return 1;
+
+    printf("AES GCM random efence test vectors with random stream:");
+    for (t = 0; RANDOMS > t; t++) {
+        int Plen = (rand() % TEST_LEN);
+        //lengths must be a multiple of 4 bytes
+        int aad_len = (rand() % TEST_LEN);
+        int offset = (rand() % MAX_UNALIGNED);
+        // Start addresses of the allocations; test.* point `offset` bytes in.
+        uint8_t *P_base = NULL;
+        uint8_t *C_base = NULL;
+        uint8_t *K_base = NULL;
+        uint8_t *IV_base = NULL;
+        uint8_t *A_base = NULL;
+        uint8_t *T_base = NULL;
+
+        // Keeps aad_len + offset non-zero, so the AAD buffer is never a
+        // zero-byte allocation.
+        if(offset == 0 && aad_len == 0)
+            offset = OFFSET_BASE_VALUE;
+
+        if (0 == (t % 25))
+            printf("\n");
+        if (0 == (t % 10))
+            fflush(0);
+
+        if (Plen + offset != 0) {
+            P_base = malloc(Plen + offset);
+            C_base = malloc(Plen + offset);
+        } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+            P_base = malloc(16);
+            C_base = malloc(16);
+        }
+        K_base = malloc(GCM_128_KEY_LEN + offset);
+        IV_base = malloc(GCM_IV_DATA_LEN + offset);
+        A_base = malloc(aad_len + offset);
+        T_base = malloc(MAX_TAG_LEN + offset);
+
+        // None of the requests above is for zero bytes, so NULL always means
+        // the allocation failed.
+        if ((NULL == P_base) || (NULL == C_base) || (NULL == K_base)
+            || (NULL == IV_base) || (NULL == A_base) || (NULL == T_base)) {
+            printf("malloc of testsize:0x%x failed\n", Plen);
+            status = 1;
+        } else {
+            test.P = P_base + offset;
+            test.Plen = Plen;
+            test.C = C_base + offset;
+            test.K = K_base + offset;
+            test.Klen = GCM_128_KEY_LEN;
+            test.IV = IV_base + offset;
+            test.IVlen = GCM_IV_DATA_LEN;
+            test.A = A_base + offset;
+            test.Alen = aad_len;
+            test.T = T_base + offset;
+
+            mk_rand_data(test.P, test.Plen);
+            mk_rand_data(test.K, test.Klen);
+            mk_rand_data(test.IV, test.IVlen);
+            mk_rand_data(test.A, test.Alen);
+
+            // single Key length of 128bits/16bytes supported
+            // single IV length of 96bits/12bytes supported
+            // Tag lengths of 8, 12 or 16
+            for (tag_len = 8; tag_len <= MAX_TAG_LEN; tag_len += 4) {
+                test.Tlen = tag_len;
+                if (0 != check_strm_vector_efence(gdata, &test)) {
+                    status = 1;
+                    break;
+                }
+            }
+        }
+
+        free(A_base);
+        free(C_base);
+        free(IV_base);
+        free(K_base);
+        free(P_base);
+        free(T_base);
+
+        if (0 != status)
+            break;
+    }
+    if (0 == status)
+        printf("\n");
+    free(gdata);
+    return status;
+}
+
+// Random-vector test that drives check_strm_vector() with a 128-bit key.
+// The gcm_data context is placed at a random byte offset (0..15) inside an
+// over-allocated buffer. Each of the RANDOMS iterations picks a random AAD length
+// (below test_len) and buffer offset, fills P/K/IV/A with mk_rand_data() and runs the
+// check once per tag length from 8 up to MAX_TAG_LEN in steps of 4.
+// Returns 0 when every iteration passes, 1 on allocation failure or a failed check.
+// All buffers are released on every path.
+int test_gcm_strm_combinations(int test_len)
+{
+	gcm_vector test;
+	int tag_len = 8;
+	int t = 0;
+	uint8_t *gdatatemp = NULL;
+	struct gcm_data *gdata = NULL;
+
+	// Check the raw allocation: the offset pointer derived from it is never NULL
+	gdatatemp = malloc(sizeof(struct gcm_data) + 16);
+	if (NULL == gdatatemp)
+		return 1;
+	gdata = (struct gcm_data *)(gdatatemp + rand() % 16);
+
+	printf("AES GCM random test vectors with random stream of average size %d:",
+	       test_len / 64);
+	for (t = 0; RANDOMS > t; t++) {
+		// NOTE(review): plaintext length is pinned to 0 here, whereas
+		// test_gcm256_strm_combinations() uses rand() % test_len - confirm intended
+		int Plen = 0;	// (rand() % test_len);
+		int aad_len = (rand() % test_len);
+		int offset = (rand() % MAX_UNALIGNED);
+		// Addresses exactly as returned by malloc(); these are what gets freed
+		uint8_t *P_base = NULL, *C_base = NULL, *K_base = NULL;
+		uint8_t *IV_base = NULL, *A_base = NULL, *T_base = NULL;
+		int failed = 0;
+
+		if (offset == 0 && aad_len == 0)
+			offset = OFFSET_BASE_VALUE;
+
+		if (0 == (t % 25))
+			printf("\n");
+		if (0 == (t % 10))
+			fflush(0);
+
+		test.Plen = Plen;
+		if (test.Plen + offset != 0) {
+			P_base = malloc(test.Plen + offset);
+			C_base = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+			P_base = malloc(16);
+			C_base = malloc(16);
+		}
+		K_base = malloc(GCM_128_KEY_LEN + offset);
+		test.Klen = GCM_128_KEY_LEN;
+		IV_base = malloc(GCM_IV_DATA_LEN + offset);
+		test.IVlen = GCM_IV_DATA_LEN;
+		A_base = malloc(aad_len + offset);
+		test.Alen = aad_len;
+		T_base = malloc(MAX_TAG_LEN + offset);
+
+		// A NULL AAD buffer is only tolerated when a zero-byte allocation was requested
+		if ((NULL == P_base) || (NULL == C_base) || (NULL == K_base)
+		    || (NULL == IV_base) || (NULL == T_base)
+		    || (NULL == A_base && 0 != aad_len + offset)) {
+			printf("malloc of testsize:0x%x failed\n", Plen);
+			failed = 1;
+		} else {
+			// Shift every buffer by the random offset
+			test.P = P_base + offset;
+			test.C = C_base + offset;
+			test.K = K_base + offset;
+			test.IV = IV_base + offset;
+			test.A = (NULL != A_base) ? A_base + offset : NULL;
+			test.T = T_base + offset;
+
+			mk_rand_data(test.P, test.Plen);
+			mk_rand_data(test.K, test.Klen);
+			mk_rand_data(test.IV, test.IVlen);
+			mk_rand_data(test.A, test.Alen);
+
+			// single Key length of 128bits/16bytes supported
+			// single IV length of 96bits/12bytes supported
+			// Tag lengths of 8, 12 or 16
+			for (tag_len = 8; tag_len <= MAX_TAG_LEN; tag_len += 4) {
+				test.Tlen = tag_len;
+				if (0 != check_strm_vector(gdata, &test, test_len)) {
+					failed = 1;
+					break;
+				}
+			}
+		}
+
+		free(A_base);
+		free(C_base);
+		free(IV_base);
+		free(K_base);
+		free(P_base);
+		free(T_base);
+		if (failed) {
+			free(gdatatemp);
+			return 1;
+		}
+	}
+	printf("\n");
+	free(gdatatemp);
+	return 0;
+}
+
+// Random-vector test that drives check_vector() with a 128-bit key.
+// Each of the RANDOMS iterations picks a random plaintext length, AAD length and
+// buffer offset, fills P/K/IV/A with mk_rand_data() and runs the check once per tag
+// length from 8 up to MAX_TAG_LEN in steps of 4.
+// Returns 0 when every iteration passes, 1 on allocation failure or a failed check.
+// All buffers are released on every path.
+int test_gcm_combinations(void)
+{
+	gcm_vector test;
+	int tag_len = 8;
+	int t = 0;
+	struct gcm_data *gdata = NULL;
+
+	gdata = malloc(sizeof(struct gcm_data));
+	if (NULL == gdata)
+		return 1;
+
+	printf("AES GCM random test vectors:");
+	for (t = 0; RANDOMS > t; t++) {
+		int Plen = (rand() % TEST_LEN);
+		int aad_len = (rand() % TEST_LEN);
+		int offset = (rand() % MAX_UNALIGNED);
+		// Addresses exactly as returned by malloc(); these are what gets freed
+		uint8_t *P_base = NULL, *C_base = NULL, *K_base = NULL;
+		uint8_t *IV_base = NULL, *A_base = NULL, *T_base = NULL;
+		int failed = 0;
+
+		if (offset == 0 && aad_len == 0)
+			offset = OFFSET_BASE_VALUE;
+
+		if (0 == (t % 25))
+			printf("\n");
+		if (0 == (t % 10))
+			fflush(0);
+
+		test.Plen = Plen;
+		if (test.Plen + offset != 0) {
+			P_base = malloc(test.Plen + offset);
+			C_base = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+			P_base = malloc(16);
+			C_base = malloc(16);
+		}
+		K_base = malloc(GCM_128_KEY_LEN + offset);
+		test.Klen = GCM_128_KEY_LEN;
+		IV_base = malloc(GCM_IV_DATA_LEN + offset);
+		test.IVlen = GCM_IV_DATA_LEN;
+		A_base = malloc(aad_len + offset);
+		test.Alen = aad_len;
+		T_base = malloc(MAX_TAG_LEN + offset);
+
+		// A NULL AAD buffer is only tolerated when a zero-byte allocation was requested
+		if ((NULL == P_base) || (NULL == C_base) || (NULL == K_base)
+		    || (NULL == IV_base) || (NULL == T_base)
+		    || (NULL == A_base && 0 != aad_len + offset)) {
+			printf("malloc of testsize:0x%x failed\n", Plen);
+			failed = 1;
+		} else {
+			// Shift every buffer by the random offset
+			test.P = P_base + offset;
+			test.C = C_base + offset;
+			test.K = K_base + offset;
+			test.IV = IV_base + offset;
+			test.A = (NULL != A_base) ? A_base + offset : NULL;
+			test.T = T_base + offset;
+
+			mk_rand_data(test.P, test.Plen);
+			mk_rand_data(test.K, test.Klen);
+			mk_rand_data(test.IV, test.IVlen);
+			mk_rand_data(test.A, test.Alen);
+
+			// single Key length of 128bits/16bytes supported
+			// single IV length of 96bits/12bytes supported
+			// Tag lengths of 8, 12 or 16
+			for (tag_len = 8; tag_len <= MAX_TAG_LEN; tag_len += 4) {
+				test.Tlen = tag_len;
+				if (0 != check_vector(gdata, &test)) {
+					failed = 1;
+					break;
+				}
+			}
+		}
+
+		free(A_base);
+		free(C_base);
+		free(IV_base);
+		free(K_base);
+		free(P_base);
+		free(T_base);
+		if (failed) {
+			free(gdata);
+			return 1;
+		}
+	}
+	printf("\n");
+	free(gdata);
+	return 0;
+}
+
+// Random-vector test that drives check_256_vector() with a 256-bit key.
+// Each of the RANDOMS iterations picks a random plaintext length, AAD length and
+// buffer offset, fills P/K/IV/A with mk_rand_data() and runs the check once per tag
+// length from 8 up to MAX_TAG_LEN in steps of 4.
+// Returns 0 when every iteration passes, 1 on allocation failure or a failed check.
+// All buffers are released on every path.
+int test_gcm256_combinations(void)
+{
+	gcm_vector test;
+	int tag_len = 8;
+	int t = 0;
+	struct gcm_data *gdata = NULL;
+
+	gdata = malloc(sizeof(struct gcm_data));
+	if (NULL == gdata)
+		return 1;
+
+	printf("AES-GCM-256 random test vectors:");
+	for (t = 0; RANDOMS > t; t++) {
+		int Plen = (rand() % TEST_LEN);
+		int aad_len = (rand() % TEST_LEN);
+		int offset = (rand() % MAX_UNALIGNED);
+		// Addresses exactly as returned by malloc(); these are what gets freed
+		uint8_t *P_base = NULL, *C_base = NULL, *K_base = NULL;
+		uint8_t *IV_base = NULL, *A_base = NULL, *T_base = NULL;
+		int failed = 0;
+
+		if (offset == 0 && aad_len == 0)
+			offset = OFFSET_BASE_VALUE;
+
+		if (0 == (t % 25))
+			printf("\n");
+		if (0 == (t % 10))
+			fflush(0);
+
+		test.Plen = Plen;
+		if (test.Plen + offset != 0) {
+			P_base = malloc(test.Plen + offset);
+			C_base = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+			P_base = malloc(16);
+			C_base = malloc(16);
+		}
+		K_base = malloc(GCM_256_KEY_LEN + offset);
+		test.Klen = GCM_256_KEY_LEN;
+		IV_base = malloc(GCM_IV_DATA_LEN + offset);
+		test.IVlen = GCM_IV_DATA_LEN;
+		A_base = malloc(aad_len + offset);
+		test.Alen = aad_len;
+		T_base = malloc(MAX_TAG_LEN + offset);
+
+		// A NULL AAD buffer is only tolerated when a zero-byte allocation was requested
+		if ((NULL == P_base) || (NULL == C_base) || (NULL == K_base)
+		    || (NULL == IV_base) || (NULL == T_base)
+		    || (NULL == A_base && 0 != aad_len + offset)) {
+			printf("malloc of testsize:0x%x failed\n", Plen);
+			failed = 1;
+		} else {
+			// Shift every buffer by the random offset
+			test.P = P_base + offset;
+			test.C = C_base + offset;
+			test.K = K_base + offset;
+			test.IV = IV_base + offset;
+			test.A = (NULL != A_base) ? A_base + offset : NULL;
+			test.T = T_base + offset;
+
+			mk_rand_data(test.P, test.Plen);
+			mk_rand_data(test.K, test.Klen);
+			mk_rand_data(test.IV, test.IVlen);
+			mk_rand_data(test.A, test.Alen);
+
+			// single IV length of 96bits/12bytes supported
+			// Tag lengths of 8, 12 or 16
+			for (tag_len = 8; tag_len <= MAX_TAG_LEN; tag_len += 4) {
+				test.Tlen = tag_len;
+				if (0 != check_256_vector(gdata, &test)) {
+					failed = 1;
+					break;
+				}
+			}
+		}
+
+		free(A_base);
+		free(C_base);
+		free(IV_base);
+		free(K_base);
+		free(P_base);
+		free(T_base);
+		if (failed) {
+			free(gdata);
+			return 1;
+		}
+	}
+	printf("\n");
+	free(gdata);
+	return 0;
+}
+
+// Random-vector test that drives check_256_strm_vector() with a 256-bit key.
+// The gcm_data context is placed at a random byte offset (0..15) inside an
+// over-allocated buffer. Each of the RANDOMS iterations picks a random plaintext
+// length and AAD length (both below test_len) and a random buffer offset, fills
+// P/K/IV/A with mk_rand_data() and runs the check once per tag length from 8 up to
+// MAX_TAG_LEN in steps of 4.
+// Returns 0 when every iteration passes, 1 on allocation failure or a failed check.
+// All buffers are released on every path.
+int test_gcm256_strm_combinations(int test_len)
+{
+	gcm_vector test;
+	int tag_len = 8;
+	int t = 0;
+	uint8_t *gdatatemp = NULL;
+	struct gcm_data *gdata = NULL;
+
+	// Check the raw allocation: the offset pointer derived from it is never NULL
+	gdatatemp = malloc(sizeof(struct gcm_data) + 16);
+	if (NULL == gdatatemp)
+		return 1;
+	gdata = (struct gcm_data *)(gdatatemp + rand() % 16);
+
+	printf("AES-GCM-256 random test vectors with random stream of average size %d:",
+	       test_len / 64);
+	for (t = 0; RANDOMS > t; t++) {
+		int Plen = (rand() % test_len);
+		int aad_len = (rand() % test_len);
+		int offset = (rand() % MAX_UNALIGNED);
+		// Addresses exactly as returned by malloc(); these are what gets freed
+		uint8_t *P_base = NULL, *C_base = NULL, *K_base = NULL;
+		uint8_t *IV_base = NULL, *A_base = NULL, *T_base = NULL;
+		int failed = 0;
+
+		if (offset == 0 && aad_len == 0)
+			offset = OFFSET_BASE_VALUE;
+
+		if (0 == (t % 25))
+			printf("\n");
+		if (0 == (t % 10))
+			fflush(0);
+
+		test.Plen = Plen;
+		if (test.Plen + offset != 0) {
+			P_base = malloc(test.Plen + offset);
+			C_base = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+			P_base = malloc(16);
+			C_base = malloc(16);
+		}
+		K_base = malloc(GCM_256_KEY_LEN + offset);
+		test.Klen = GCM_256_KEY_LEN;
+		IV_base = malloc(GCM_IV_DATA_LEN + offset);
+		test.IVlen = GCM_IV_DATA_LEN;
+		A_base = malloc(aad_len + offset);
+		test.Alen = aad_len;
+		T_base = malloc(MAX_TAG_LEN + offset);
+
+		// A NULL AAD buffer is only tolerated when a zero-byte allocation was requested
+		if ((NULL == P_base) || (NULL == C_base) || (NULL == K_base)
+		    || (NULL == IV_base) || (NULL == T_base)
+		    || (NULL == A_base && 0 != aad_len + offset)) {
+			printf("malloc of testsize:0x%x failed\n", Plen);
+			failed = 1;
+		} else {
+			// Shift every buffer by the random offset
+			test.P = P_base + offset;
+			test.C = C_base + offset;
+			test.K = K_base + offset;
+			test.IV = IV_base + offset;
+			test.A = (NULL != A_base) ? A_base + offset : NULL;
+			test.T = T_base + offset;
+
+			mk_rand_data(test.P, test.Plen);
+			mk_rand_data(test.K, test.Klen);
+			mk_rand_data(test.IV, test.IVlen);
+			mk_rand_data(test.A, test.Alen);
+
+			// single IV length of 96bits/12bytes supported
+			// Tag lengths of 8, 12 or 16
+			for (tag_len = 8; tag_len <= MAX_TAG_LEN; tag_len += 4) {
+				test.Tlen = tag_len;
+				if (0 != check_256_strm_vector(gdata, &test, test_len)) {
+					failed = 1;
+					break;
+				}
+			}
+		}
+
+		free(A_base);
+		free(C_base);
+		free(IV_base);
+		free(K_base);
+		free(P_base);
+		free(T_base);
+		if (failed) {
+			free(gdatatemp);
+			return 1;
+		}
+	}
+	printf("\n");
+	free(gdatatemp);
+	return 0;
+}
+
+//
+// Place all data so that it ends exactly at the end of a PAGE_LEN-byte allocation, to
+// check for reads past the end. For both key sizes (128 then 256) and every offset
+// below MAX_UNALIGNED, the plaintext and AAD shrink with the offset while staying
+// flush against the end of their buffers.
+// NOTE(review): the buffers come from plain malloc(); presumably the end of each
+// allocation only coincides with a page boundary under an efence-style allocator.
+// Returns 0 on success and 1 on allocation failure or a failed check. A negative
+// value is never returned, because main() sums the results and a -1 could cancel
+// another test's failure. Every buffer is released on every path.
+//
+int test_gcm_efence(void)
+{
+	gcm_vector test;
+	int offset = 0;
+	int ret = 1;		// failure until the whole sweep has passed
+	gcm_key_size key_len;
+	struct gcm_data *gdata = NULL;
+	uint8_t *P, *C, *K, *IV, *A, *T;
+
+	gdata = malloc(sizeof(struct gcm_data));
+	P = malloc(PAGE_LEN);
+	C = malloc(PAGE_LEN);
+	K = malloc(PAGE_LEN);
+	IV = malloc(PAGE_LEN);
+	A = malloc(PAGE_LEN);
+	T = malloc(PAGE_LEN);
+	if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) || (NULL == A)
+	    || (NULL == T) || (NULL == gdata)) {
+		printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+		goto out;
+	}
+
+	test.Plen = PAGE_LEN / 2;
+	// place buffers to end at page boundary
+	test.IVlen = GCM_IV_DATA_LEN;
+	test.Alen = test.Plen;
+	test.Tlen = MAX_TAG_LEN;
+
+	printf("AES GCM efence test vectors:");
+	for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len;
+	     key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) {
+		test.Klen = key_len;
+		for (offset = 0; MAX_UNALIGNED > offset; offset++) {
+			if (0 == (offset % 80))
+				printf("\n");
+			// move the start and size of the data block towards the end of the page
+			test.Plen = (PAGE_LEN / 2) - offset;
+			test.Alen = (PAGE_LEN / 4) - (offset * 4);	//lengths must be a multiple of 4 bytes
+			//Place data at end of page
+			test.P = P + PAGE_LEN - test.Plen;
+			test.C = C + PAGE_LEN - test.Plen;
+			test.K = K + PAGE_LEN - test.Klen;
+			test.IV = IV + PAGE_LEN - test.IVlen;
+			test.A = A + PAGE_LEN - test.Alen;
+			test.T = T + PAGE_LEN - test.Tlen;
+
+			mk_rand_data(test.P, test.Plen);
+			mk_rand_data(test.K, test.Klen);
+			mk_rand_data(test.IV, test.IVlen);
+			mk_rand_data(test.A, test.Alen);
+			if (GCM_128_KEY_LEN == key_len) {
+				if (0 != check_vector(gdata, &test))
+					goto out;
+			} else {
+				if (0 != check_256_vector(gdata, &test))
+					goto out;
+			}
+		}
+	}
+	printf("\n");
+	ret = 0;
+
+out:
+	// free(NULL) is a no-op, so a partial allocation failure is handled here too
+	free(gdata);
+	free(P);
+	free(C);
+	free(K);
+	free(IV);
+	free(A);
+	free(T);
+	return ret;
+}
+
+// Checks one known-answer vector against the aesni_gcm128_* routines and the OpenSSL
+// wrappers: encrypt, in-place encrypt, decrypt, in-place decrypt, then the three
+// cross-checks ISA-L->ISA-L, OpenSSL->ISA-L and ISA-L->OpenSSL.
+// vector: known-answer vector; its K/IV/A/P/C/T buffers are only read.
+// Returns 0 when every comparison matched, nonzero otherwise (1 on allocation
+// failure). All temporary buffers are released on every path.
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+	struct gcm_data gdata;
+	int OK = 0;
+	// Temporary array for the calculated vectors
+	uint8_t *ct_test = NULL;
+	uint8_t *pt_test = NULL;
+	uint8_t *IV_c = NULL;
+	uint8_t *T_test = NULL;
+	uint8_t *T2_test = NULL;
+	uint8_t const IVend[] = GCM_IV_END_MARK;
+	uint64_t IV_alloc_len = 0;
+	int result;
+
+#ifdef GCM_VECTORS_VERBOSE
+	printf("AES-GCM-128:\n");
+#endif
+
+	// Allocate space for the calculated ciphertext
+	ct_test = malloc(vector->Plen);
+	if (ct_test == NULL) {
+		fprintf(stderr, "Can't allocate ciphertext memory\n");
+		OK = 1;
+		goto out;
+	}
+	// Allocate space for the calculated plaintext
+	pt_test = malloc(vector->Plen);
+	if (pt_test == NULL) {
+		fprintf(stderr, "Can't allocate plaintext memory\n");
+		OK = 1;
+		goto out;
+	}
+	IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV plus its end marker
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+		OK = 1;
+		goto out;
+	}
+	//Add end marker to the IV data for ISA-L
+	memcpy(IV_c, vector->IV, vector->IVlen);
+	memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+	T_test = malloc(vector->Tlen);
+	T2_test = malloc(vector->Tlen);
+	if ((T_test == NULL) || (T2_test == NULL)) {
+		fprintf(stderr, "Can't allocate tag memory\n");
+		OK = 1;
+		goto out;
+	}
+	// This is only required once for a given key
+	aesni_gcm128_pre(vector->K, &gdata);
+#ifdef GCM_VECTORS_VERBOSE
+	dump_gcm_data(&gdata);
+#endif
+
+	////
+	// ISA-l Encrypt
+	////
+	aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+	// The OpenSSL tag goes into T2_test (Tlen bytes, still unused at this point).
+	// pt_test only holds Plen bytes and can be shorter than the tag.
+	openssl_aes_gcm_enc(vector->K, vector->IV,
+			    vector->IVlen, vector->A,
+			    vector->Alen, T2_test, vector->Tlen,
+			    vector->P, vector->Plen, ct_test);
+	OK |= check_data(T2_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)");
+	// test of in-place encrypt
+	memcpy(pt_test, vector->P, vector->Plen);
+	aesni_gcm128_enc(&gdata, pt_test, pt_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |=
+	    check_data(pt_test, vector->C, vector->Plen,
+		       "ISA-L encrypted cypher text(in-place)");
+	memset(ct_test, 0, vector->Plen);
+	memset(T_test, 0, vector->Tlen);
+
+	////
+	// ISA-l Decrypt
+	////
+	aesni_gcm128_dec(&gdata, pt_test, vector->C, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+	// The tag produced by decryption must be verified against the expected tag value
+	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+	// test of in-place decrypt
+	memcpy(ct_test, vector->C, vector->Plen);
+	aesni_gcm128_dec(&gdata, ct_test, ct_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+	OK |=
+	    check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+	// ISA-L enc -> ISA-L dec
+	aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	memset(pt_test, 0, vector->Plen);
+	aesni_gcm128_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T2_test, vector->Tlen);
+	OK |=
+	    check_data(pt_test, vector->P, vector->Plen,
+		       "ISA-L self decrypted plain text (P)");
+	OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+	// OpenSSl enc -> ISA-L dec
+	openssl_aes_gcm_enc(vector->K, vector->IV,
+			    vector->IVlen, vector->A,
+			    vector->Alen, T_test, vector->Tlen,
+			    vector->P, vector->Plen, ct_test);
+	OK |=
+	    check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+	memset(pt_test, 0, vector->Plen);
+	aesni_gcm128_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T2_test, vector->Tlen);
+	OK |=
+	    check_data(pt_test, vector->P, vector->Plen,
+		       "OpenSSL->ISA-L decrypted plain text (P)");
+	OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+	// ISA-L enc -> OpenSSl dec
+	aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	memset(pt_test, 0, vector->Plen);
+	result =
+	    openssl_aes_gcm_dec(vector->K, vector->IV,
+				vector->IVlen, vector->A,
+				vector->Alen, T_test, vector->Tlen,
+				ct_test, vector->Plen, pt_test);
+	if (-1 == result)
+		printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+	OK |= (-1 == result);
+	OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)");
+
+out:
+	// free(NULL) is a no-op, so buffers that were never allocated need no guard
+	free(ct_test);
+	free(pt_test);
+	free(IV_c);
+	free(T_test);
+	free(T2_test);
+
+	return OK;
+}
+
+// Checks one known-answer vector against the aesni_gcm256_* routines and the OpenSSL
+// wrappers: encrypt, in-place encrypt, decrypt, in-place decrypt, then the three
+// cross-checks ISA-L->ISA-L, OpenSSL->ISA-L and ISA-L->OpenSSL.
+// vector: known-answer vector; its K/IV/A/P/C/T buffers are only read.
+// Returns 0 when every comparison matched, nonzero otherwise (1 on allocation
+// failure). All temporary buffers are released on every path.
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+	struct gcm_data gdata;
+	int OK = 0;
+	// Temporary array for the calculated vectors
+	uint8_t *ct_test = NULL;
+	uint8_t *pt_test = NULL;
+	uint8_t *IV_c = NULL;
+	uint8_t *T_test = NULL;
+	uint8_t *T2_test = NULL;
+	uint8_t const IVend[] = GCM_IV_END_MARK;
+	uint64_t IV_alloc_len = 0;
+	int result;
+
+#ifdef GCM_VECTORS_VERBOSE
+	printf("AES-GCM-256:\n");
+#endif
+
+	// Allocate space for the calculated ciphertext
+	ct_test = malloc(vector->Plen);
+	// Allocate space for the calculated plaintext
+	pt_test = malloc(vector->Plen);
+	if ((ct_test == NULL) || (pt_test == NULL)) {
+		fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+		OK = 1;
+		goto out;
+	}
+	IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV plus its end marker
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+		OK = 1;
+		goto out;
+	}
+	//Add end marker to the IV data for ISA-L
+	memcpy(IV_c, vector->IV, vector->IVlen);
+	memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+	T_test = malloc(vector->Tlen);
+	T2_test = malloc(vector->Tlen);
+	// T2_test receives tags further down, so it must be checked as well
+	if ((T_test == NULL) || (T2_test == NULL)) {
+		fprintf(stderr, "Can't allocate tag memory\n");
+		OK = 1;
+		goto out;
+	}
+	// This is only required once for a given key
+	aesni_gcm256_pre(vector->K, &gdata);
+#ifdef GCM_VECTORS_VERBOSE
+	dump_gcm_data(&gdata);
+#endif
+
+	////
+	// ISA-l Encrypt
+	////
+	memset(ct_test, 0, vector->Plen);
+	aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+	// The OpenSSL tag goes into T2_test (Tlen bytes, still unused at this point).
+	// pt_test only holds Plen bytes and can be shorter than the tag.
+	openssl_aes_256_gcm_enc(vector->K, vector->IV,
+				vector->IVlen, vector->A,
+				vector->Alen, T2_test, vector->Tlen,
+				vector->P, vector->Plen, ct_test);
+	// The ciphertext is Plen bytes long; compare all of it and nothing beyond it
+	OK |= check_data(ct_test, vector->C, vector->Plen, "OpenSSL vs KA - cypher text (C)");
+	OK |= check_data(T2_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)");
+	OK |= check_data(T2_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)");
+	// test of in-place encrypt
+	memcpy(pt_test, vector->P, vector->Plen);
+	aesni_gcm256_enc(&gdata, pt_test, pt_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |=
+	    check_data(pt_test, vector->C, vector->Plen,
+		       "ISA-L encrypted cypher text(in-place)");
+	memset(ct_test, 0, vector->Plen);
+	memset(T_test, 0, vector->Tlen);
+
+	////
+	// ISA-l Decrypt
+	////
+	aesni_gcm256_dec(&gdata, pt_test, vector->C, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+	// The tag produced by decryption must be verified against the expected tag value
+	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+	// test of in-place decrypt
+	memcpy(ct_test, vector->C, vector->Plen);
+	aesni_gcm256_dec(&gdata, ct_test, ct_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+	OK |=
+	    check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+	// ISA-L enc -> ISA-L dec
+	aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	memset(pt_test, 0, vector->Plen);
+	aesni_gcm256_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T2_test, vector->Tlen);
+	OK |=
+	    check_data(pt_test, vector->P, vector->Plen,
+		       "ISA-L self decrypted plain text (P)");
+	OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+	// OpenSSl enc -> ISA-L dec
+	openssl_aes_256_gcm_enc(vector->K, vector->IV,
+				vector->IVlen, vector->A,
+				vector->Alen, T_test, vector->Tlen,
+				vector->P, vector->Plen, ct_test);
+	OK |=
+	    check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+	memset(pt_test, 0, vector->Plen);
+	aesni_gcm256_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T2_test, vector->Tlen);
+	OK |=
+	    check_data(pt_test, vector->P, vector->Plen,
+		       "OpenSSL->ISA-L decrypted plain text (P)");
+	OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+	// ISA-L enc -> OpenSSl dec
+	aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	memset(pt_test, 0, vector->Plen);
+	result =
+	    openssl_aes_256_gcm_dec(vector->K, vector->IV,
+				    vector->IVlen, vector->A,
+				    vector->Alen, T_test, vector->Tlen,
+				    ct_test, vector->Plen, pt_test);
+	if (-1 == result)
+		printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+	OK |= (-1 == result);
+	OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)");
+
+out:
+	// free(NULL) is a no-op, so buffers that were never allocated need no guard
+	free(ct_test);
+	free(pt_test);
+	free(IV_c);
+	free(T_test);
+	free(T2_test);
+
+	return OK;
+}
+
+// Walks the gcm_vectors table, dispatching each entry to the 128-bit or 256-bit
+// known-answer test according to its key length. Stops at the first failing vector
+// and returns that test's nonzero result; returns 0 once every vector has passed.
+int test_gcm_std_vectors(void)
+{
+	int const total = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+	int idx = 0;
+
+	printf("AES-GCM standard test vectors:\n");
+	while (idx < total) {
+		gcm_vector const *cur = &gcm_vectors[idx];
+		int status;
+
+#ifdef GCM_VECTORS_VERBOSE
+		printf
+		    ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+		     idx, total - 1, (int)cur->Klen,
+		     (int)cur->IVlen, (int)cur->Plen,
+		     (int)cur->Alen, (int)cur->Tlen);
+#else
+		printf(".");
+#endif
+
+		// Key length selects which implementation is exercised
+		status = (BITS_128 == cur->Klen)
+		    ? test_gcm128_std_vectors(cur)
+		    : test_gcm256_std_vectors(cur);
+		if (0 != status)
+			return status;
+		idx++;
+	}
+	printf("\n");
+	return 0;
+}
+
+// The length of the data is set to length. The first stream is from 0 to start. After
+// that the data is broken into breaks chunks of equal size (except possibly the last
+// one due to divisibility).
+// Each of the RANDOMS iterations picks a random AAD length and buffer offset, fills
+// P/K/IV/A with mk_rand_data() and runs check_strm_vector2() with a 128-bit key once
+// per tag length from 8 up to MAX_TAG_LEN in steps of 4.
+// Returns 0 when every iteration passes, 1 on allocation failure or a failed check.
+// All buffers are released on every path.
+int test_gcm_strm_combinations2(int length, int start, int breaks)
+{
+	gcm_vector test;
+	int tag_len = 8;
+	int t = 0;
+	struct gcm_data *gdata = NULL;
+
+	gdata = malloc(sizeof(struct gcm_data));
+	if (NULL == gdata)
+		return 1;
+
+	printf("AES GCM random test vectors of length %d and stream with %d breaks:", length,
+	       breaks + 1);
+	for (t = 0; RANDOMS > t; t++) {
+		int Plen = length;
+		int aad_len = (rand() % TEST_LEN);
+		int offset = (rand() % MAX_UNALIGNED);
+		// Addresses exactly as returned by malloc(); these are what gets freed
+		uint8_t *P_base = NULL, *C_base = NULL, *K_base = NULL;
+		uint8_t *IV_base = NULL, *A_base = NULL, *T_base = NULL;
+		int failed = 0;
+
+		if (offset == 0 && aad_len == 0)
+			offset = OFFSET_BASE_VALUE;
+
+		if (0 == (t % 25))
+			printf("\n");
+		if (0 == (t % 10))
+			fflush(0);
+
+		test.Plen = Plen;
+		if (test.Plen + offset != 0) {
+			P_base = malloc(test.Plen + offset);
+			C_base = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+			P_base = malloc(16);
+			C_base = malloc(16);
+		}
+		K_base = malloc(GCM_128_KEY_LEN + offset);
+		test.Klen = GCM_128_KEY_LEN;
+		IV_base = malloc(GCM_IV_DATA_LEN + offset);
+		test.IVlen = GCM_IV_DATA_LEN;
+		A_base = malloc(aad_len + offset);
+		test.Alen = aad_len;
+		T_base = malloc(MAX_TAG_LEN + offset);
+
+		// A NULL AAD buffer is only tolerated when a zero-byte allocation was requested
+		if ((NULL == P_base) || (NULL == C_base) || (NULL == K_base)
+		    || (NULL == IV_base) || (NULL == T_base)
+		    || (NULL == A_base && 0 != aad_len + offset)) {
+			printf("malloc of testsize:0x%x failed\n", Plen);
+			failed = 1;
+		} else {
+			// Shift every buffer by the random offset
+			test.P = P_base + offset;
+			test.C = C_base + offset;
+			test.K = K_base + offset;
+			test.IV = IV_base + offset;
+			test.A = (NULL != A_base) ? A_base + offset : NULL;
+			test.T = T_base + offset;
+
+			mk_rand_data(test.P, test.Plen);
+			mk_rand_data(test.K, test.Klen);
+			mk_rand_data(test.IV, test.IVlen);
+			mk_rand_data(test.A, test.Alen);
+
+			// single Key length of 128bits/16bytes supported
+			// single IV length of 96bits/12bytes supported
+			// Tag lengths of 8, 12 or 16
+			for (tag_len = 8; tag_len <= MAX_TAG_LEN; tag_len += 4) {
+				test.Tlen = tag_len;
+				if (0 != check_strm_vector2(gdata, &test, length, start, breaks)) {
+					failed = 1;
+					break;
+				}
+			}
+		}
+
+		free(A_base);
+		free(C_base);
+		free(IV_base);
+		free(K_base);
+		free(P_base);
+		free(T_base);
+		if (failed) {
+			free(gdata);
+			return 1;
+		}
+	}
+	printf("\n");
+	free(gdata);
+	return 0;
+}
+
+// Test driver. Seeds rand() from the first command-line argument (TEST_SEED when no
+// argument is given), prints the seed, runs every test group in a fixed order and
+// returns the sum of their results; 0 means everything passed.
+int main(int argc, char **argv)
+{
+	int seed = (argc == 1) ? TEST_SEED : atoi(argv[1]);
+	int failures = 0;
+
+	srand(seed);
+	printf("SEED: %d\n", seed);
+
+	// The order matters: every group draws from the same rand() sequence
+	failures += test_gcm_std_vectors();
+	failures += test_gcm256_combinations();
+	failures += test_gcm_combinations();
+	failures += test_gcm_efence();
+	failures += test_gcm256_strm_combinations(TEST_LEN);
+	failures += test_gcm_strm_combinations(TEST_LEN);
+	failures += test_gcm256_strm_combinations(1024);
+	failures += test_gcm_strm_combinations(1024);
+	failures += test_gcm_strm_efence();
+	failures += test_gcm_strm_combinations2(1024, 0, 1024);
+
+	printf("%s\n", (0 != failures) ? "...Fail" : "...Pass");
+
+	return failures;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c
new file mode 100644
index 00000000..c5c6367b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c
@@ -0,0 +1,322 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include "gcm_vectors.h"
+#include "types.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+// Compares len bytes of test against expected.
+// test:      buffer produced by the code under test
+// expected:  reference buffer
+// len:       number of bytes to compare; 0 always counts as a match
+// data_name: label printed in the mismatch report
+// Returns 0 when the buffers match, 1 otherwise. On a mismatch the first differing
+// byte pair and its offset are printed (in hex) to stdout.
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+	uint64_t a;
+
+	// Nothing to compare; also keeps memcmp() away from possibly-NULL empty buffers
+	if (0 == len)
+		return 0;
+
+	if (0 == memcmp(test, expected, len))
+		return 0;
+
+	printf(" expected results don't match %s \t\t", data_name);
+	for (a = 0; a < len; a++) {
+		if (test[a] != expected[a]) {
+			// uint64_t is not unsigned long on every platform, so cast
+			// explicitly to the type the conversion specifier expects
+			printf(" '%x' != '%x' at %llx of %llx\n",
+			       (unsigned int)test[a], (unsigned int)expected[a],
+			       (unsigned long long)a, (unsigned long long)len);
+			break;
+		}
+	}
+	return 1;
+}
+
+// Checks one known-answer vector against the aesni_gcm128_* routines: encrypt,
+// in-place encrypt, decrypt, in-place decrypt, and an encrypt -> decrypt round trip.
+// vector: known-answer vector; its K/IV/A/P/C/T buffers are only read.
+// Returns 0 when every comparison matched, nonzero otherwise (1 on allocation
+// failure). All temporary buffers are released on every path.
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+	struct gcm_data gdata;
+	int OK = 0;
+	// Temporary array for the calculated vectors
+	uint8_t *ct_test = NULL;
+	uint8_t *pt_test = NULL;
+	uint8_t *IV_c = NULL;
+	uint8_t *T_test = NULL;
+	uint8_t *T2_test = NULL;
+	uint8_t const IVend[] = GCM_IV_END_MARK;
+	uint64_t IV_alloc_len = 0;
+
+	// Allocate space for the calculated ciphertext
+	ct_test = malloc(vector->Plen);
+	if (ct_test == NULL) {
+		fprintf(stderr, "Can't allocate ciphertext memory\n");
+		OK = 1;
+		goto out;
+	}
+	// Allocate space for the calculated plaintext
+	pt_test = malloc(vector->Plen);
+	if (pt_test == NULL) {
+		fprintf(stderr, "Can't allocate plaintext memory\n");
+		OK = 1;
+		goto out;
+	}
+	IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV plus its end marker
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+		OK = 1;
+		goto out;
+	}
+	//Add end marker to the IV data for ISA-L
+	memcpy(IV_c, vector->IV, vector->IVlen);
+	memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+	T_test = malloc(vector->Tlen);
+	T2_test = malloc(vector->Tlen);
+	if ((T_test == NULL) || (T2_test == NULL)) {
+		fprintf(stderr, "Can't allocate tag memory\n");
+		OK = 1;
+		goto out;
+	}
+	// This is only required once for a given key
+	aesni_gcm128_pre(vector->K, &gdata);
+
+	////
+	// ISA-l Encrypt
+	////
+	aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+	// test of in-place encrypt
+	memcpy(pt_test, vector->P, vector->Plen);
+	aesni_gcm128_enc(&gdata, pt_test, pt_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(pt_test, vector->C, vector->Plen,
+			 "ISA-L encrypted cypher text(in-place)");
+	memset(ct_test, 0, vector->Plen);
+	memset(T_test, 0, vector->Tlen);
+
+	////
+	// ISA-l Decrypt
+	////
+	aesni_gcm128_dec(&gdata, pt_test, vector->C, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+	// The tag produced by decryption must be verified against the expected tag value
+	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+	// test of in-place decrypt
+	memcpy(ct_test, vector->C, vector->Plen);
+	aesni_gcm128_dec(&gdata, ct_test, ct_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T_test, vector->Tlen);
+	OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+	OK |=
+	    check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+	// ISA-L enc -> ISA-L dec
+	aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+			 IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+	memset(pt_test, 0, vector->Plen);
+	aesni_gcm128_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+			 vector->A, vector->Alen, T2_test, vector->Tlen);
+	OK |=
+	    check_data(pt_test, vector->P, vector->Plen,
+		       "ISA-L self decrypted plain text (P)");
+	OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+out:
+	// free(NULL) is a no-op, so buffers that were never allocated need no guard
+	free(ct_test);
+	free(pt_test);
+	free(IV_c);
+	free(T_test);
+	free(T2_test);
+
+	return OK;
+}
+
+// Run one AES-256 GCM known-answer vector through aesni_gcm256_enc() and
+// aesni_gcm256_dec(): out-of-place and in-place encryption, out-of-place and
+// in-place decryption, and finally an encrypt -> decrypt round trip.
+//
+// vector: key, IV, AAD and plaintext together with the expected ciphertext
+//         and tag.
+//
+// Returns the bitwise OR of every check_data() result, or 1 when a scratch
+// buffer cannot be allocated. Callers in this file treat 0 as success.
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+    struct gcm_data gdata;
+    int OK = 0;
+    // Scratch buffers for the calculated values; all start as NULL so the
+    // single cleanup path below can free them unconditionally.
+    uint8_t *ct_test = NULL;
+    uint8_t *pt_test = NULL;
+    uint8_t *IV_c = NULL;
+    uint8_t *T_test = NULL;
+    uint8_t *T2_test = NULL;
+    uint8_t const IVend[] = GCM_IV_END_MARK;
+    uint64_t IV_alloc_len = 0;
+
+    // Space for the calculated ciphertext and plaintext
+    ct_test = malloc(vector->Plen);
+    pt_test = malloc(vector->Plen);
+    if ((ct_test == NULL) || (pt_test == NULL)) {
+        fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+        OK = 1;
+        goto cleanup;
+    }
+
+    // Space for the IV followed by the end marker
+    IV_alloc_len = vector->IVlen + sizeof(IVend);
+    IV_c = malloc(IV_alloc_len);
+    if (IV_c == NULL) {
+        fprintf(stderr, "Can't allocate IV memory\n");
+        OK = 1;
+        goto cleanup;
+    }
+    //Add end marker to the IV data for ISA-L
+    memcpy(IV_c, vector->IV, vector->IVlen);
+    memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+    // Two tag buffers: T_test for the direct checks, T2_test for the
+    // round-trip comparison. Both are written to, so both must be valid.
+    T_test = malloc(vector->Tlen);
+    T2_test = malloc(vector->Tlen);
+    if ((T_test == NULL) || (T2_test == NULL)) {
+        fprintf(stderr, "Can't allocate tag memory\n");
+        OK = 1;
+        goto cleanup;
+    }
+    // This is only required once for a given key
+    aesni_gcm256_pre(vector->K, &gdata);
+
+    ////
+    // ISA-l Encrypt
+    ////
+    memset(ct_test, 0, vector->Plen);
+    aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+                     IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+    OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+    OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+    // test of in-place encrypt
+    memcpy(pt_test, vector->P, vector->Plen);
+    aesni_gcm256_enc(&gdata, pt_test, pt_test, vector->Plen, IV_c,
+                     vector->A, vector->Alen, T_test, vector->Tlen);
+    OK |= check_data(pt_test, vector->C, vector->Plen,
+                     "ISA-L encrypted cypher text(in-place)");
+    // Clear the outputs so the decrypt checks cannot pass on stale data
+    memset(ct_test, 0, vector->Plen);
+    memset(T_test, 0, vector->Tlen);
+
+    ////
+    // ISA-l Decrypt
+    ////
+    aesni_gcm256_dec(&gdata, pt_test, vector->C, vector->Plen,
+                     IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+    OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+    // GCM decryption outputs a tag value that must be verified against the expected tag value
+    OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+    // test of in-place decrypt
+    memcpy(ct_test, vector->C, vector->Plen);
+    aesni_gcm256_dec(&gdata, ct_test, ct_test, vector->Plen, IV_c,
+                     vector->A, vector->Alen, T_test, vector->Tlen);
+    OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+    OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+
+    // ISA-L enc -> ISA-L dec
+    aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+                     IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+    memset(pt_test, 0, vector->Plen);
+    aesni_gcm256_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+                     vector->A, vector->Alen, T2_test, vector->Tlen);
+    OK |= check_data(pt_test, vector->P, vector->Plen,
+                     "ISA-L self decrypted plain text (P)");
+    OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+cleanup:
+    // free(NULL) is a no-op, so no guards are needed here
+    free(ct_test);
+    free(pt_test);
+    free(IV_c);
+    free(T_test);
+    free(T2_test);
+
+    return OK;
+}
+
+// Walk the gcm_vectors table and run each entry through the 128-bit or
+// 256-bit test routine, chosen by the entry's Klen. Stops at the first
+// failing vector and returns its non-zero status; returns 0 when every
+// vector passes.
+int test_gcm_std_vectors(void)
+{
+    int const total = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+    int status = 0;
+    int idx = 0;
+
+    printf("AES-GCM standard test vectors:\n");
+    while (idx < total) {
+        gcm_vector const *cur = &gcm_vectors[idx];
+
+#ifdef DEBUG
+        printf
+            ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+             idx, total - 1, (int)cur->Klen,
+             (int)cur->IVlen, (int)cur->Plen,
+             (int)cur->Alen, (int)cur->Tlen);
+#else
+        printf(".");
+#endif
+
+        // Anything that is not a 128-bit key is handled as a 256-bit key
+        status |= (BITS_128 == cur->Klen)
+            ? test_gcm128_std_vectors(cur)
+            : test_gcm256_std_vectors(cur);
+
+        // Bail out on the first failure, before the closing newline
+        if (status != 0)
+            return status;
+
+        idx++;
+    }
+    printf("\n");
+    return status;
+}
+
+// Test driver: seed the C random number generator (TEST_SEED when no
+// argument is given, otherwise the first argument parsed with atoi), run
+// the standard-vector tests and report the outcome. The process exit
+// status is the value returned by test_gcm_std_vectors().
+int main(int argc, char **argv)
+{
+    int const seed = (1 == argc) ? TEST_SEED : atoi(argv[1]);
+    int errors;
+
+    srand(seed);
+    printf("SEED: %d\n", seed);
+
+    errors = test_gcm_std_vectors();
+
+    fputs((errors != 0) ? "...Fail\n" : "...Pass\n", stdout);
+
+    return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h
new file mode 100644
index 00000000..c8bb34a5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h
@@ -0,0 +1,476 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_GCM_VECTORS_H_
+#define AES_GCM_VECTORS_H_
+
+#include <stdint.h>
+
+// Supported AES key sizes. The enumerator names give the key size in bits;
+// the enumerator values are the same sizes expressed in bytes (16 and 32).
+typedef enum gcm_key_size { BITS_128 = 16, BITS_256 = 32 } gcm_key_size;
+// Size of the key array K as reported by sizeof, i.e. a byte count despite
+// the macro's name. K must be an actual array, not a pointer.
+#define KBITS(K) (sizeof(K))
+
+// One AES-GCM test vector: pointers to the inputs (key, IV, AAD, plaintext)
+// and to the expected outputs (ciphertext, tag), each with its length.
+// The field order matters: the vector() macro fills this struct positionally.
+typedef struct gcm_vector {
+ uint8_t* K; // AES key
+ gcm_key_size Klen; // key length as a gcm_key_size value (byte count: 16 or 32)
+ uint8_t* IV; // initialization vector used by GCM
+ uint64_t IVlen; // length of IV in bytes
+ uint8_t* A; // additional authenticated data (AAD)
+ uint64_t Alen; // length of AAD in bytes
+ uint8_t* P; // plaintext
+ uint64_t Plen; // length of the plaintext in bytes
+ // expected outputs of encryption
+ uint8_t* C; // ciphertext, same length as the plaintext
+ uint8_t* T; // authentication tag
+ uint8_t Tlen; // tag length, stored in bytes; the tag can be 0 to 128 bits long
+} gcm_vector;
+
+///////
+// 60-Byte Packet Encryption Using GCM-AES-128
+// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
+// K: AD7A2BD03EAC835A6F620FDCB506B345
+// IV: 12153524C0895E81B2C28465
+// AAD: D609B1F056637A0D46DF998D88E52E00
+// B2C2846512153524C0895E81
+// P: 08000F101112131415161718191A1B1C
+// 1D1E1F202122232425262728292A2B2C
+// 2D2E2F303132333435363738393A0002
+// C: 701AFA1CC039C0D765128A665DAB6924
+// 3899BF7318CCDC81C9931DA17FBE8EDD
+// 7D17CB8B4C26FC81E3284F2B7FBA713D
+// AT: 4F8D55E7D3F06FD5A13C0C29B9D5B880
+// H: 73A23D80121DE2D5A850253FCF43120E
+///////
+static uint8_t K1[] = {0xAD, 0x7A, 0x2B, 0xD0, 0x3E, 0xAC, 0x83, 0x5A, 0x6F, 0x62, 0x0F, 0xDC, 0xB5, 0x06, 0xB3, 0x45};
+static uint8_t P1[] = {
+ 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
+ , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
+ , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x00, 0x02
+};
+static uint8_t IV1[] = {0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81, 0xB2, 0xC2, 0x84, 0x65};
+static uint8_t A1[] = {
+ 0xD6, 0x09, 0xB1, 0xF0, 0x56, 0x63, 0x7A, 0x0D, 0x46, 0xDF, 0x99, 0x8D, 0x88, 0xE5, 0x2E, 0x00
+ , 0xB2, 0xC2, 0x84, 0x65, 0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81
+};
+#define A1_len sizeof(A1)
+static uint8_t C1[] = {
+ 0x70, 0x1A, 0xFA, 0x1C, 0xC0, 0x39, 0xC0, 0xD7, 0x65, 0x12, 0x8A, 0x66, 0x5D, 0xAB, 0x69, 0x24
+ , 0x38, 0x99, 0xBF, 0x73, 0x18, 0xCC, 0xDC, 0x81, 0xC9, 0x93, 0x1D, 0xA1, 0x7F, 0xBE, 0x8E, 0xDD
+ , 0x7D, 0x17, 0xCB, 0x8B, 0x4C, 0x26, 0xFC, 0x81, 0xE3, 0x28, 0x4F, 0x2B, 0x7F, 0xBA, 0x71, 0x3D
+};
+static uint8_t T1[] = {
+ 0x4F, 0x8D, 0x55, 0xE7, 0xD3, 0xF0, 0x6F, 0xD5, 0xA1, 0x3C, 0x0C, 0x29, 0xB9, 0xD5, 0xB8, 0x80
+};
+
+
+///////
+// 54-Byte Packet Encryption Using GCM-AES-128
+// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
+// K: 071B113B0CA743FECCCF3D051F737382
+// IV: F0761E8DCD3D000176D457ED
+// AAD: E20106D7CD0DF0761E8DCD3D88E54C2A
+// 76D457ED
+// P: 08000F101112131415161718191A1B1C
+// 1D1E1F202122232425262728292A2B2C
+// 2D2E2F30313233340004
+// C: 13B4C72B389DC5018E72A171DD85A5D3
+// 752274D3A019FBCAED09A425CD9B2E1C
+// 9B72EEE7C9DE7D52B3F3
+// AT: D6A5284F4A6D3FE22A5D6C2B960494C3
+// H: E4E01725D724C1215C7309AD34539257
+///////
+static uint8_t K2[] = {0x07, 0x1B, 0x11, 0x3B, 0x0C, 0xA7, 0x43, 0xFE, 0xCC, 0xCF, 0x3D, 0x05, 0x1F, 0x73, 0x73, 0x82};
+static uint8_t P2[] = {
+ 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
+ , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
+ , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x04
+};
+static uint8_t IV2[] = {0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x00, 0x01, 0x76, 0xD4, 0x57, 0xED};
+//static uint8_t IV1p[] = {0, 0, 0, 1};
+static uint8_t A2[] = {
+ 0xE2, 0x01, 0x06, 0xD7, 0xCD, 0x0D, 0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x88, 0xE5, 0x4C, 0x2A
+ , 0x76, 0xD4, 0x57, 0xED
+};
+#define A2_len sizeof(A2)
+static uint8_t C2[] = {
+ 0x13, 0xB4, 0xC7, 0x2B, 0x38, 0x9D, 0xC5, 0x01, 0x8E, 0x72, 0xA1, 0x71, 0xDD, 0x85, 0xA5, 0xD3
+ , 0x75, 0x22, 0x74, 0xD3, 0xA0, 0x19, 0xFB, 0xCA, 0xED, 0x09, 0xA4, 0x25, 0xCD, 0x9B, 0x2E, 0x1C
+ , 0x9B, 0x72, 0xEE, 0xE7, 0xC9, 0xDE, 0x7D, 0x52, 0xB3, 0xF3
+};
+static uint8_t T2[] = {
+ 0xD6, 0xA5, 0x28, 0x4F, 0x4A, 0x6D, 0x3F, 0xE2, 0x2A, 0x5D, 0x6C, 0x2B, 0x96, 0x04, 0x94, 0xC3
+};
+
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 128]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K: c939cc13397c1d37de6ae0e1cb7c423c
+// IV: b3d8cc017cbb89b39e0f67e2
+// P: c3b3c41f113a31b73d9a5cd432103069
+// AAD: 24825602bd12a984e0092d3e448eda5f
+// C: 93fe7d9e9bfd10348a5606e5cafa7354
+// AT: 0032a1dc85f1c9786925a2e71d8272dd
+///////
+static uint8_t K3[] = {0xc9, 0x39, 0xcc, 0x13, 0x39, 0x7c, 0x1d, 0x37, 0xde, 0x6a, 0xe0, 0xe1, 0xcb, 0x7c, 0x42, 0x3c};
+static uint8_t IV3[] = {0xb3, 0xd8, 0xcc, 0x01, 0x7c, 0xbb, 0x89, 0xb3, 0x9e, 0x0f, 0x67, 0xe2};
+static uint8_t P3[] = {0xc3, 0xb3, 0xc4, 0x1f, 0x11, 0x3a, 0x31, 0xb7, 0x3d, 0x9a, 0x5c, 0xd4, 0x32, 0x10, 0x30, 0x69};
+static uint8_t A3[] = {0x24, 0x82, 0x56, 0x02, 0xbd, 0x12, 0xa9, 0x84, 0xe0, 0x09, 0x2d, 0x3e, 0x44, 0x8e, 0xda, 0x5f};
+#define A3_len sizeof(A3)
+static uint8_t C3[] = {0x93, 0xfe, 0x7d, 0x9e, 0x9b, 0xfd, 0x10, 0x34, 0x8a, 0x56, 0x06, 0xe5, 0xca, 0xfa, 0x73, 0x54};
+static uint8_t T3[] = {0x00, 0x32, 0xa1, 0xdc, 0x85, 0xf1, 0xc9, 0x78, 0x69, 0x25, 0xa2, 0xe7, 0x1d, 0x82, 0x72, 0xdd};
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 256]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K = 298efa1ccf29cf62ae6824bfc19557fc
+// IV = 6f58a93fe1d207fae4ed2f6d
+// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
+// AAD = 021fafd238463973ffe80256e5b1c6b1
+// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
+// T = 542465ef599316f73a7a560509a2d9f2
+///////
+static uint8_t K4[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
+static uint8_t IV4[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
+static uint8_t P4[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
+static uint8_t A4[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
+#define A4_len sizeof(A4)
+static uint8_t C4[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
+static uint8_t T4[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};
+
+///////
+// NOTE: this entry repeats the previous vector byte for byte (K5/IV5/P5/A5/C5/T5
+// hold the same values as K4/IV4/P4/A4/C4/T4), so it adds no extra coverage.
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 256]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K = 298efa1ccf29cf62ae6824bfc19557fc
+// IV = 6f58a93fe1d207fae4ed2f6d
+// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
+// AAD = 021fafd238463973ffe80256e5b1c6b1
+// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
+// T = 542465ef599316f73a7a560509a2d9f2
+///////
+static uint8_t K5[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
+static uint8_t IV5[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
+static uint8_t P5[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
+static uint8_t A5[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
+#define A5_len sizeof(A5)
+static uint8_t C5[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
+static uint8_t T5[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};
+
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 2
+// K: 00000000000000000000000000000000
+// P: 00000000000000000000000000000000
+// IV: 000000000000000000000000
+// C: 0388dace60b6a392f328c2b971b2fe78
+// T: ab6e47d42cec13bdf53a67b21257bddf
+// H: 66e94bd4ef8a2c3b884cfa59ca342b2e
+///////
+static uint8_t K6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t P6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t IV6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+// A6 holds a single zero byte only so that the array is non-empty; A6_len is 0,
+// so vector(6) records an AAD length of zero.
+static uint8_t A6[] = {0};
+#define A6_len 0
+static uint8_t C6[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78};
+static uint8_t T6[] = {0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd, 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf};
+
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 3
+// K: feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b391aafd255
+// IV: cafebabefacedbaddecaf888
+// H: b83b533708bf535d0aa6e52980d53b78
+// C: 42831ec2217774244b7221b784d0d49c
+// e3aa212f2c02a4e035c17e2329aca12e
+// 21d514b25466931c7d8f6a5aac84aa05
+// 1ba30b396a0aac973d58e091473f5985
+// T: 4d5c2af327cd64a62cf35abd2ba6fab4
+///////
+static uint8_t K7[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P7[] = {0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
+ , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
+ , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
+ , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55};
+static uint8_t IV7[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+// A7 holds a single zero byte only so that the array is non-empty; A7_len is 0,
+// so vector(7) records an AAD length of zero.
+static uint8_t A7[] = {0};
+#define A7_len 0
+static uint8_t C7[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
+ , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
+ , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
+ , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85};
+static uint8_t T7[] = {0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6, 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 4
+// K: feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbaddecaf888
+// H: b83b533708bf535d0aa6e52980d53b78
+// C: 42831ec2217774244b7221b784d0d49c
+// e3aa212f2c02a4e035c17e2329aca12e
+// 21d514b25466931c7d8f6a5aac84aa05
+// 1ba30b396a0aac973d58e091
+// T: 5bc94fbc3221a5db94fae95ae7121a47
+///////
+static uint8_t K8[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P8[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
+ , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
+ , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
+ , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39};
+static uint8_t A8[] = {0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef
+ , 0xab, 0xad, 0xda, 0xd2};
+#define A8_len sizeof(A8)
+static uint8_t IV8[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+// Expected ciphertext for Test Case 4: 60 bytes, the same length as P8.
+// (The table previously carried four extra trailing bytes left over from the
+// 64-byte Test Case 3 ciphertext.)
+static uint8_t C8[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
+ , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
+ , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
+ , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91};
+static uint8_t T8[] = {0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 14
+// K: 00000000000000000000000000000000
+// 00000000000000000000000000000000
+// P: 00000000000000000000000000000000
+// A:
+// IV: 000000000000000000000000
+// H: dc95c078a2408989ad48a21492842087
+// C: cea7403d4d606b6e074ec5d3baf39d18
+// T: d0d1c8a799996bf0265b98b5d48ab919
+///////
+static uint8_t K9[] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static uint8_t P9[] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+};
+// A9 holds a single zero byte only so that the array is non-empty; A9_len is 0,
+// so vector(9) records an AAD length of zero.
+static uint8_t A9[] = {0};
+#define A9_len 0
+static uint8_t IV9[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static uint8_t C9[] = {
+ 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18
+};
+static uint8_t T9[] = {0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0, 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 15
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b391aafd255
+// A:
+// IV: cafebabefacedbaddecaf888
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 522dc1f099567d07f47f37a32a84427d
+// 643a8cdcbfe5c0c97598a2bd2555d1aa
+// 8cb08e48590dbb3da7b08b1056828838
+// c5f61e6393ba7a0abcc9f662898015ad
+// T: b094dac5d93471bdec1a502270e3cc6c
+///////
+static uint8_t K10[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P10[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+// A10 holds a single zero byte only so that the array is non-empty; A10_len is 0,
+// so vector(10) records an AAD length of zero.
+static uint8_t A10[] = {0};
+#define A10_len 0
+static uint8_t IV10[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C10[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+static uint8_t T10[] = {
+ 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd, 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 16
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbaddecaf888
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 522dc1f099567d07f47f37a32a84427d
+// 643a8cdcbfe5c0c97598a2bd2555d1aa
+// 8cb08e48590dbb3da7b08b1056828838
+// c5f61e6393ba7a0abcc9f662
+// T: 76fc6ece0f4e1768cddf8853bb2d551b
+///////
+static uint8_t K11[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P11[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39
+};
+static uint8_t A11[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2};
+#define A11_len sizeof(A11)
+static uint8_t IV11[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C11[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62
+};
+static uint8_t T11[] = {0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68, 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 17 -- Not supported IV length less than 12 bytes
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbad
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: c3762df1ca787d32ae47c13bf19844cb
+// af1ae14d0b976afac52ff7d79bba9de0
+// feb582d33934a4f0954cc2363bc73f78
+// 62ac430e64abe499f47c9b1f
+// T: 3a337dbf46a792c45e454913fe2ea8f2
+///////
+//static uint8_t K12[] = {
+// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+//static uint8_t P12[] = {
+// 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+// 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+// 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+// 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39
+//};
+//static uint8_t A12[] = {
+// 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+// 0xab, 0xad, 0xda, 0xd2};
+//static uint8_t IV12[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad};
+//static uint8_t H12[] = {
+// 0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda, 0xd7};
+//static uint8_t C12[] = {
+// 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32, 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb,
+// 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa, 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0,
+// 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0, 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78,
+// 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99, 0xf4, 0x7c, 0x9b, 0x1f
+//};
+//static uint8_t T12[] = {
+// 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4, 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 18 -- Not supported IV length greater than 12 bytes
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: 9313225df88406e555909c5aff5269aa
+// 6a7a9538534f7da1e4c303d2a318a728
+// c3c0c95156809539fcf0e2429a6b5254
+// 16aedbf5a0de6a57a637b39b
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 5a8def2f0c9e53f1f75d7853659e2a20
+// eeb2b22aafde6419a058ab4f6f746bf4
+// 0fc0c3b780f244452da3ebf1c5d82cde
+// a2418997200ef82e44ae7e3f
+// T: a44a8266ee1c8eb0c8b5d4cf5ae9f19a
+///////
+
+
+// Build a positional gcm_vector initializer for test case N from the arrays
+// K<N>, IV<N>, A<N>, P<N>, C<N> and T<N>. Every length is the sizeof of its
+// array except the AAD length, which comes from the A<N>_len macro so that a
+// test case can declare zero-length AAD.
+#define vector(N) {K##N, (KBITS(K##N)), IV##N, sizeof(IV##N), A##N, A##N##_len, P##N, sizeof(P##N), C##N, T##N, sizeof(T##N)}
+
+// Table of AES-GCM test vectors; each entry is expanded by vector(N) from the
+// K/IV/A/P/C/T arrays defined above.
+gcm_vector const gcm_vectors[] = {
+ //field order {K, Klen, IV, IVlen, A, Alen, P, Plen, C, T, Tlen};
+ // original vector does not have a valid sub hash key
+ vector(1),
+ vector(2),
+ vector(3),
+ vector(4),
+ vector(5),
+ vector(6),
+ vector(7),
+ vector(8),
+ vector(9),
+ vector(10),
+ vector(11),
+ /* vector(12), -- IVs shorter than 12 bytes are not supported */
+};
+
+#endif /* AES_GCM_VECTORS_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm
new file mode 100644
index 00000000..9f7a1307
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm
@@ -0,0 +1,320 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Routine to do AES key expansion
+
+%include "reg_sizes.asm"
+
+%macro key_expansion_128_sse 0
+ ;; In: xmm1 = previous round key (dwords k0..k3, low to high), xmm2 = aeskeygenassist result, dword 0 of xmm3 = 0 (callers zero xmm3 once; the shuffles below never change dword 0). Out: xmm1 = next round key.
+ pshufd xmm2, xmm2, 11111111b ; broadcast dword 3 of the keygen-assist result to all lanes
+ shufps xmm3, xmm1, 00010000b ; xmm3 = {0, 0, k1, k0}
+ pxor xmm1, xmm3 ; xmm1 = {k0, k1, k2^k1, k3^k0}
+ shufps xmm3, xmm1, 10001100b ; xmm3 = {0, k0, k0, k2^k1}
+ pxor xmm1, xmm3 ; xmm1 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+ pxor xmm1, xmm2 ; fold the broadcast keygen word into every lane
+%endmacro
+
+%macro key_expansion_128_avx 0
+ ;; VEX-encoded twin of key_expansion_128_sse. In: xmm1 = previous round key (dwords k0..k3), xmm2 = vaeskeygenassist result, dword 0 of xmm3 = 0. Out: xmm1 = next round key.
+ vpshufd xmm2, xmm2, 11111111b ; broadcast dword 3 of the keygen-assist result to all lanes
+ vshufps xmm3, xmm3, xmm1, 00010000b ; xmm3 = {0, 0, k1, k0}
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = {k0, k1, k2^k1, k3^k0}
+ vshufps xmm3, xmm3, xmm1, 10001100b ; xmm3 = {0, k0, k0, k2^k1}
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+ vpxor xmm1, xmm1, xmm2 ; fold the broadcast keygen word into every lane
+%endmacro
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi ; elf64 output: the three pointer arguments are read from rdi, rsi, rdx
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx ; any other output format: the arguments are read from rcx, rdx, r8
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+
+; void aes_keyexp_128(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: KEY (rdi on elf64, rcx otherwise): pointer to the 16-byte key
+; arg 2: EXP_ENC_KEYS (rsi on elf64, rdx otherwise): pointer to expanded key array for encrypt; 11 round keys of 16 bytes are written
+; arg 3: EXP_DEC_KEYS (rdx on elf64, r8 otherwise): pointer to expanded key array for decrypt; same keys in reverse slot order, rounds 1-9 passed through aesimc
+;
+global aes_keyexp_128_sse:function
+aes_keyexp_128_sse:
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory (last decrypt slot = raw key)
+ pxor xmm3, xmm3 ; zero xmm3, as key_expansion_128_sse expects
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*1], xmm1
+ aesimc xmm4, xmm1 ; decrypt schedule holds aesimc(round key) in slot 10 - round
+ movdqu [EXP_DEC_KEYS + 16*9], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*3], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*7], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*5], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*5], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*7], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*3], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*9], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*1], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1 ; last round key goes to decrypt slot 0 without aesimc
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global aes_keyexp_128_avx:function
+aes_keyexp_128_avx: ; VEX-encoded variant of aes_keyexp_128_sse: same three arguments, same output layout
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory (last decrypt slot = raw key)
+ vpxor xmm3, xmm3, xmm3 ; zero xmm3, as key_expansion_128_avx expects
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm1
+ vaesimc xmm4, xmm1 ; decrypt schedule holds aesimc(round key) in slot 10 - round
+ vmovdqu [EXP_DEC_KEYS + 16*9], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*7], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*5], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*3], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*1], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1 ; last round key goes to decrypt slot 0 without vaesimc
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; void aes_keyexp_128_enc_sse(UINT8 *key,
+; UINT8 *enc_exp_keys);
+;
+; arg 1: KEY (rdi on elf64, rcx otherwise): pointer to the 16-byte key
+; arg 2: EXP_ENC_KEYS (rsi on elf64, rdx otherwise): pointer to expanded key array for encrypt; 11 round keys of 16 bytes are written. No decrypt schedule is produced.
+;
+global aes_keyexp_128_enc_sse:function
+aes_keyexp_128_enc_sse:
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1 ; round key 0 is the raw key
+ pxor xmm3, xmm3 ; zero xmm3, as key_expansion_128_sse expects
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*1], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*3], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*5], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*7], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*9], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+
+ ret
+
+global aes_keyexp_128_enc_avx:function
+aes_keyexp_128_enc_avx: ; VEX-encoded variant of aes_keyexp_128_enc_sse: KEY in, 11 encrypt round keys out, no decrypt schedule
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1 ; round key 0 is the raw key
+ vpxor xmm3, xmm3, xmm3 ; zero xmm3, as key_expansion_128_avx expects
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+
+ ret
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm
new file mode 100644
index 00000000..33fcef83
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm
@@ -0,0 +1,268 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi ; elf64 output: the three pointer arguments are read from rdi, rsi, rdx
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx ; any other output format: the arguments are read from rcx, rdx, r8
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+
+
+
+%macro key_expansion_1_192_sse 1
+ ;; %1 = byte offset into EXP_ENC_KEYS for the 16-byte result. In: xmm1 = four earlier key words (k0..k3, low to high), xmm2 = aeskeygenassist result, dword 0 of xmm3 = 0. Out: xmm1 = next four key words.
+ pshufd xmm2, xmm2, 11111111b ; broadcast dword 3 of the keygen-assist result to all lanes
+ shufps xmm3, xmm1, 00010000b ; xmm3 = {0, 0, k1, k0}
+ pxor xmm1, xmm3 ; xmm1 = {k0, k1, k2^k1, k3^k0}
+ shufps xmm3, xmm1, 10001100b ; xmm3 = {0, k0, k0, k2^k1}
+ pxor xmm1, xmm3 ; xmm1 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+ pxor xmm1, xmm2 ; fold the broadcast keygen word into every lane
+ movdqu [EXP_ENC_KEYS+%1], xmm1 ; write the four new words into the encrypt schedule
+%endmacro
+
+; Calculate the next two key words (w10 and w11 on first use) from the top word of xmm1 (w9) and the two words held in xmm4 (w4-w5); %1 = byte offset of the store
+%macro key_expansion_2_192_sse 1
+ movdqu xmm5, xmm4 ; xmm4 = {0, 0, a, b} in the callers in this file
+ pslldq xmm5, 4 ; shift up by one dword: xmm5 = {0, 0, 0, a}
+ shufps xmm6, xmm1, 11110000b ; xmm6 = {x6[0], x6[0], k3, k3}; callers zero xmm6 first, so dword 0 is 0
+ pxor xmm6, xmm5 ; xmm6 = {0, 0, k3, k3^a}
+ pxor xmm4, xmm6 ; xmm4 = {0, 0, a^k3, b^a^k3}: the two new words
+ pshufd xmm7, xmm4, 00001110b ; move the two new words into the low 8 bytes of xmm7
+ movdqu [EXP_ENC_KEYS+%1], xmm7 ; 16-byte store, only the low 8 bytes are key words; callers overwrite the rest with the next store at %1+8
+%endmacro
+
+%macro key_dec_192_sse 1
+ movdqu xmm0, [EXP_ENC_KEYS + 16 * %1] ; %1 = round index: reload encrypt round key %1
+ aesimc xmm1, xmm0 ; apply aesimc to it (clobbers xmm0 and xmm1)
+ movdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1 ; decrypt schedule is stored in reverse slot order
+%endmacro
+
+
+
+
+
+%macro key_expansion_1_192_avx 1
+ ;; VEX-encoded twin of key_expansion_1_192_sse. %1 = byte offset into EXP_ENC_KEYS. In: xmm1 = four earlier key words (k0..k3), xmm2 = vaeskeygenassist result, dword 0 of xmm3 = 0. Out: xmm1 = next four key words.
+ vpshufd xmm2, xmm2, 11111111b ; broadcast dword 3 of the keygen-assist result to all lanes
+ vshufps xmm3, xmm3, xmm1, 00010000b ; xmm3 = {0, 0, k1, k0}
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = {k0, k1, k2^k1, k3^k0}
+ vshufps xmm3, xmm3, xmm1, 10001100b ; xmm3 = {0, k0, k0, k2^k1}
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+ vpxor xmm1, xmm1, xmm2 ; fold the broadcast keygen word into every lane
+ vmovdqu [EXP_ENC_KEYS+%1], xmm1 ; write the four new words into the encrypt schedule
+%endmacro
+
+; Calculate the next two key words (w10 and w11 on first use) from the top word of xmm1 (w9) and the two words held in xmm4 (w4-w5); %1 = byte offset of the store
+%macro key_expansion_2_192_avx 1
+ vmovdqa xmm5, xmm4 ; xmm4 = {0, 0, a, b} in the callers in this file
+ vpslldq xmm5, xmm5, 4 ; shift up by one dword: xmm5 = {0, 0, 0, a}
+ vshufps xmm6, xmm6, xmm1, 11110000b ; xmm6 = {x6[0], x6[0], k3, k3}; callers zero xmm6 first, so dword 0 is 0
+ vpxor xmm6, xmm6, xmm5 ; xmm6 = {0, 0, k3, k3^a}
+ vpxor xmm4, xmm4, xmm6 ; xmm4 = {0, 0, a^k3, b^a^k3}: the two new words
+ vpshufd xmm7, xmm4, 00001110b ; move the two new words into the low 8 bytes of xmm7
+ vmovdqu [EXP_ENC_KEYS+%1], xmm7 ; 16-byte store, only the low 8 bytes are key words; callers overwrite the rest with the next store at %1+8
+%endmacro
+
+%macro key_dec_192_avx 1
+ vmovdqu xmm0, [EXP_ENC_KEYS + 16 * %1] ; %1 = round index: reload encrypt round key %1
+ vaesimc xmm1, xmm0 ; apply vaesimc to it (clobbers xmm0 and xmm1)
+ vmovdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1 ; decrypt schedule is stored in reverse slot order
+%endmacro
+
+
+
+
+; void aes_keyexp_192(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: KEY (rdi on elf64, rcx otherwise): pointer to the 24-byte key
+; arg 2: EXP_ENC_KEYS (rsi on elf64, rdx otherwise): pointer to expanded key array for encrypt; 13 round keys of 16 bytes are written
+; arg 3: EXP_DEC_KEYS (rdx on elf64, r8 otherwise): pointer to expanded key array for decrypt; same keys in reverse slot order, rounds 1-11 passed through aesimc
+;
+global aes_keyexp_192_sse:function
+aes_keyexp_192_sse:
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ sub rsp, 16*2 + 8 ; 32 bytes to spill xmm6/xmm7 plus 8 bytes of padding
+ movdqu [rsp + 0*16], xmm6 ; xmm6/xmm7 are clobbered below; presumably callee-saved under the non-elf64 calling convention
+ movdqu [rsp + 1*16], xmm7
+%endif
+
+ movq xmm7, [KEY + 16] ; loading the AES key, 64 bits (key bytes 16-23)
+ movq [EXP_ENC_KEYS + 16], xmm7 ; copy those 8 key bytes straight into the encrypt schedule
+ pshufd xmm4, xmm7, 01001111b ; xmm4 = {0, 0, w4, w5}: layout expected by key_expansion_2_192_sse
+ movdqu xmm1, [KEY] ; loading the AES key, 128 bits
+ movdqu [EXP_ENC_KEYS], xmm1 ; encrypt round key 0 is the first 16 key bytes
+ ;; decrypt slot 0 is written once, near the end, from encrypt round key 12 (a redundant early store to it was dropped)
+ movdqu [EXP_DEC_KEYS + 16*12], xmm1 ; last decrypt slot = encrypt round key 0
+
+ pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion.
+ pxor xmm6, xmm6 ; Set xmm6 to be all zeros. Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192_sse 24
+ key_expansion_2_192_sse 40
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192_sse 48
+ key_expansion_2_192_sse 64
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192_sse 72
+ key_expansion_2_192_sse 88
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192_sse 96
+ key_expansion_2_192_sse 112
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192_sse 120
+ key_expansion_2_192_sse 136
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192_sse 144
+ key_expansion_2_192_sse 160
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192_sse 168
+ key_expansion_2_192_sse 184
+
+ aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192_sse 192
+
+;;; decrypt slot 12 already holds encrypt round key 0 (pure key input);
+;;; decrypt slot 0 takes encrypt round key 12 unchanged
+ movdqu xmm0, [EXP_ENC_KEYS + 16 * 12]
+ movdqu [EXP_DEC_KEYS + 16*0], xmm0
+;;; generate remaining decrypt keys
+ key_dec_192_sse 1
+ key_dec_192_sse 2
+ key_dec_192_sse 3
+ key_dec_192_sse 4
+ key_dec_192_sse 5
+ key_dec_192_sse 6
+ key_dec_192_sse 7
+ key_dec_192_sse 8
+ key_dec_192_sse 9
+ key_dec_192_sse 10
+ key_dec_192_sse 11
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ movdqu xmm6, [rsp + 0*16] ; restore the registers spilled in the prologue
+ movdqu xmm7, [rsp + 1*16]
+ add rsp, 16*2 + 8
+%endif
+
+ ret
+
+
+
+global aes_keyexp_192_avx:function
+aes_keyexp_192_avx: ; VEX-encoded variant of aes_keyexp_192_sse: KEY (24 bytes) in, 13 encrypt and 13 decrypt round keys out
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ sub rsp, 16*2 + 8 ; 32 bytes to spill xmm6/xmm7 plus 8 bytes of padding
+ vmovdqu [rsp + 0*16], xmm6 ; xmm6/xmm7 are clobbered below; presumably callee-saved under the non-elf64 calling convention
+ vmovdqu [rsp + 1*16], xmm7
+%endif
+
+ vmovq xmm7, [KEY + 16] ; loading the AES key, 64 bits (key bytes 16-23)
+ vmovq [EXP_ENC_KEYS + 16], xmm7 ; copy those 8 key bytes straight into the encrypt schedule
+ vpshufd xmm4, xmm7, 01001111b ; xmm4 = {0, 0, w4, w5}: layout expected by key_expansion_2_192_avx
+ vmovdqu xmm1, [KEY] ; loading the AES key, 128 bits
+ vmovdqu [EXP_ENC_KEYS], xmm1 ; encrypt round key 0 is the first 16 key bytes
+ ;; decrypt slot 0 is written once, near the end, from encrypt round key 12 (a redundant early store to it was dropped)
+ vmovdqu [EXP_DEC_KEYS + 16*12], xmm1 ; last decrypt slot = encrypt round key 0
+
+ vpxor xmm3, xmm3, xmm3 ; zero xmm3, required by key_expansion_1_192_avx
+ vpxor xmm6, xmm6, xmm6 ; zero xmm6, required by key_expansion_2_192_avx
+
+ vaeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192_avx 24
+ key_expansion_2_192_avx 40
+
+ vaeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192_avx 48
+ key_expansion_2_192_avx 64
+
+ vaeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192_avx 72
+ key_expansion_2_192_avx 88
+
+ vaeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192_avx 96
+ key_expansion_2_192_avx 112
+
+ vaeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192_avx 120
+ key_expansion_2_192_avx 136
+
+ vaeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192_avx 144
+ key_expansion_2_192_avx 160
+
+ vaeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192_avx 168
+ key_expansion_2_192_avx 184
+
+ vaeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192_avx 192
+
+;;; decrypt slot 12 already holds encrypt round key 0 (pure key input);
+;;; decrypt slot 0 takes encrypt round key 12 unchanged
+ vmovdqu xmm0, [EXP_ENC_KEYS + 16 * 12]
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm0
+;;; generate remaining decrypt keys
+ key_dec_192_avx 1
+ key_dec_192_avx 2
+ key_dec_192_avx 3
+ key_dec_192_avx 4
+ key_dec_192_avx 5
+ key_dec_192_avx 6
+ key_dec_192_avx 7
+ key_dec_192_avx 8
+ key_dec_192_avx 9
+ key_dec_192_avx 10
+ key_dec_192_avx 11
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ vmovdqu xmm6, [rsp + 0*16] ; restore the registers spilled in the prologue
+ vmovdqu xmm7, [rsp + 1*16]
+ add rsp, 16*2 + 8
+%endif
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm
new file mode 100644
index 00000000..ff08cd0f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm
@@ -0,0 +1,280 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+; Routine to do AES key expansion
+
+%include "reg_sizes.asm"
+
+; Uses the f() function of the aeskeygenassist result (dword 3); updates the even-numbered round key held in xmm1
+%macro key_expansion_256_sse 0
+ ;; In: xmm1 = round key from two rounds earlier (dwords k0..k3, low to high), xmm2 = aeskeygenassist result, dword 0 of xmm3 = 0 (callers zero xmm3 once). Out: xmm1 = new round key.
+ pshufd xmm2, xmm2, 11111111b ; broadcast dword 3 of the keygen-assist result to all lanes
+ shufps xmm3, xmm1, 00010000b ; xmm3 = {0, 0, k1, k0}
+ pxor xmm1, xmm3 ; xmm1 = {k0, k1, k2^k1, k3^k0}
+ shufps xmm3, xmm1, 10001100b ; xmm3 = {0, k0, k0, k2^k1}
+ pxor xmm1, xmm3 ; xmm1 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+ pxor xmm1, xmm2 ; fold the broadcast keygen word into every lane
+%endmacro
+
+; Uses the SubWord function of the aeskeygenassist result (dword 2); updates the odd-numbered round key held in xmm4
+%macro key_expansion_256_sse_2 0
+ ;; In: xmm4 = round key from two rounds earlier (dwords k0..k3, low to high), xmm2 = aeskeygenassist result, dword 0 of xmm3 = 0. Out: xmm4 = new round key.
+ pshufd xmm2, xmm2, 10101010b ; broadcast dword 2 of the keygen-assist result to all lanes
+ shufps xmm3, xmm4, 00010000b ; xmm3 = {0, 0, k1, k0}
+ pxor xmm4, xmm3 ; xmm4 = {k0, k1, k2^k1, k3^k0}
+ shufps xmm3, xmm4, 10001100b ; xmm3 = {0, k0, k0, k2^k1}
+ pxor xmm4, xmm3 ; xmm4 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+ pxor xmm4, xmm2 ; fold the broadcast keygen word into every lane
+%endmacro
+
+; Uses the f() function of the vaeskeygenassist result (dword 3); updates the even-numbered round key held in xmm1
+%macro key_expansion_256_avx 0
+ ;; VEX-encoded twin of key_expansion_256_sse. In: xmm1 = round key from two rounds earlier (dwords k0..k3), xmm2 = vaeskeygenassist result, dword 0 of xmm3 = 0. Out: xmm1 = new round key.
+ vpshufd xmm2, xmm2, 11111111b ; broadcast dword 3 of the keygen-assist result to all lanes
+ vshufps xmm3, xmm3, xmm1, 00010000b ; xmm3 = {0, 0, k1, k0}
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = {k0, k1, k2^k1, k3^k0}
+ vshufps xmm3, xmm3, xmm1, 10001100b ; xmm3 = {0, k0, k0, k2^k1}
+ vpxor xmm1, xmm1, xmm3 ; xmm1 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+ vpxor xmm1, xmm1, xmm2 ; fold the broadcast keygen word into every lane
+%endmacro
+
+; Uses the SubWord function of the vaeskeygenassist result (dword 2); updates the odd-numbered round key held in xmm4
+%macro key_expansion_256_avx_2 0
+ ;; VEX-encoded twin of key_expansion_256_sse_2. In: xmm4 = round key from two rounds earlier (dwords k0..k3), xmm2 = vaeskeygenassist result, dword 0 of xmm3 = 0. Out: xmm4 = new round key.
+ vpshufd xmm2, xmm2, 10101010b ; broadcast dword 2 of the keygen-assist result to all lanes
+ vshufps xmm3, xmm3, xmm4, 00010000b ; xmm3 = {0, 0, k1, k0}
+ vpxor xmm4, xmm4, xmm3 ; xmm4 = {k0, k1, k2^k1, k3^k0}
+ vshufps xmm3, xmm3, xmm4, 10001100b ; xmm3 = {0, k0, k0, k2^k1}
+ vpxor xmm4, xmm4, xmm3 ; xmm4 = {k0, k0^k1, k0^k1^k2, k0^k1^k2^k3}
+ vpxor xmm4, xmm4, xmm2 ; fold the broadcast keygen word into every lane
+%endmacro
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi ; elf64 output: the three pointer arguments are read from rdi, rsi, rdx
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx ; any other output format: the arguments are read from rcx, rdx, r8
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+; void aes_keyexp_256(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: KEY (rdi on elf64, rcx otherwise): pointer to the 32-byte key
+; arg 2: EXP_ENC_KEYS (rsi on elf64, rdx otherwise): pointer to expanded key array for encrypt; 15 round keys of 16 bytes are written
+; arg 3: EXP_DEC_KEYS (rdx on elf64, r8 otherwise): pointer to expanded key array for decrypt; same keys in reverse slot order, rounds 1-13 passed through aesimc
+;
+global aes_keyexp_256_sse:function
+aes_keyexp_256_sse:
+ movdqu xmm1, [KEY] ; loading the AES key (first 16 bytes = round key 0)
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory (last decrypt slot = round key 0)
+
+ movdqu xmm4, [KEY+16] ; loading the AES key (second 16 bytes = round key 1)
+ movdqu [EXP_ENC_KEYS + 16*1], xmm4
+ aesimc xmm0, xmm4 ; decrypt schedule holds aesimc(round key) in slot 14 - round
+ movdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory
+
+ pxor xmm3, xmm3 ; Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*12], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*3], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*11], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*10], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*5], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*9], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*7], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*7], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*9], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*5], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*11], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*3], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*12], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*13], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*1], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*14], xmm1
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1 ; last round key goes to decrypt slot 0 without aesimc
+
+ ret
+
+
+global aes_keyexp_256_avx:function
+aes_keyexp_256_avx: ; VEX-encoded variant of aes_keyexp_256_sse: KEY (32 bytes) in, 15 encrypt and 15 decrypt round keys out
+ vmovdqu xmm1, [KEY] ; loading the AES key (first 16 bytes = round key 0)
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory (last decrypt slot = round key 0)
+
+ vmovdqu xmm4, [KEY+16] ; loading the AES key (second 16 bytes = round key 1)
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm4
+ vaesimc xmm0, xmm4 ; decrypt schedule holds vaesimc(round key) in slot 14 - round
+ vmovdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory
+
+ vpxor xmm3, xmm3, xmm3 ; Required for the key_expansion.
+
+ vaeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*12], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*11], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*10], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*9], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*7], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*5], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*11], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*3], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*12], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*13], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*1], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*14], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1 ; last round key goes to decrypt slot 0 without vaesimc
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm
new file mode 100644
index 00000000..698ea4b6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt ; not referenced directly in this file; presumably consumed by the macros from multibinary.asm - TODO confirm
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+
+extern aes_keyexp_128_sse
+extern aes_keyexp_128_avx
+extern aes_keyexp_128_enc_sse
+extern aes_keyexp_128_enc_avx
+
+extern aes_keyexp_192_sse
+extern aes_keyexp_192_avx
+
+extern aes_keyexp_256_sse
+extern aes_keyexp_256_avx
+
+%include "multibinary.asm"
+
+
+;;;;
+; instantiate the aes_keyexp_128, aes_keyexp_128_enc, aes_keyexp_192 and aes_keyexp_256 interfaces; each gets its SSE routine once and its AVX routine for both remaining arguments (argument meaning is defined in multibinary.asm)
+;;;;
+mbin_interface aes_keyexp_128
+mbin_dispatch_init aes_keyexp_128, aes_keyexp_128_sse, aes_keyexp_128_avx, aes_keyexp_128_avx
+
+mbin_interface aes_keyexp_128_enc
+mbin_dispatch_init aes_keyexp_128_enc, aes_keyexp_128_enc_sse, aes_keyexp_128_enc_avx, aes_keyexp_128_enc_avx
+
+mbin_interface aes_keyexp_192
+mbin_dispatch_init aes_keyexp_192, aes_keyexp_192_sse, aes_keyexp_192_avx, aes_keyexp_192_avx
+
+mbin_interface aes_keyexp_256
+mbin_dispatch_init aes_keyexp_256, aes_keyexp_256_sse, aes_keyexp_256_avx, aes_keyexp_256_avx
+
+section .text
+;;; func core, ver, snum (note: no slversion entry is emitted for aes_keyexp_128_enc)
+slversion aes_keyexp_128, 00, 01, 02a1
+slversion aes_keyexp_192, 00, 01, 02a2
+slversion aes_keyexp_256, 00, 01, 02a3
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h
new file mode 100644
index 00000000..db71ebb8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h
@@ -0,0 +1,300 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_OSSL_HELPER_H_
+#define AES_OSSL_HELPER_H_
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+#include <openssl/evp.h>
+
+/**
+ * Decrypt a buffer with OpenSSL AES-128-CBC, padding disabled.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param key        key handed to EVP_DecryptInit_ex()
+ * @param iv         IV handed to EVP_DecryptInit_ex()
+ * @param len        number of ciphertext bytes to process
+ * @param cyphertext input buffer
+ * @param plaintext  output buffer
+ * @return the byte count stored by EVP_DecryptFinal_ex(), or -1 when no
+ *         cipher context could be allocated
+ */
+static inline
+int openssl_aes_128_cbc_dec(uint8_t * key, uint8_t * iv,
+			    int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+	int outlen = 0, tmplen = 0;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_128_cbc\n");
+		return -1;
+	}
+	if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv))
+		printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_cbc\n");
+	if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+		printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+	if (!EVP_DecryptUpdate(ctx, plaintext, &outlen, (uint8_t const *)cyphertext, len))
+		printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_128_cbc\n");
+	if (!EVP_DecryptFinal_ex(ctx, &plaintext[outlen], &tmplen))
+		printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_128_cbc %x, %x, %x\n", len,
+		       outlen, tmplen);
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return tmplen;
+}
+
+/**
+ * Encrypt a buffer with OpenSSL AES-128-CBC, padding disabled.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param key        key handed to EVP_EncryptInit_ex()
+ * @param iv         IV handed to EVP_EncryptInit_ex()
+ * @param len        number of plaintext bytes to process
+ * @param plaintext  input buffer
+ * @param cyphertext output buffer
+ * @return the byte count stored by EVP_EncryptFinal_ex(), or -1 when no
+ *         cipher context could be allocated
+ */
+static inline
+int openssl_aes_128_cbc_enc(uint8_t * key, uint8_t * iv,
+			    int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+	/* Zero-initialised so a failed step never feeds an indeterminate
+	 * offset or return value to the code that follows. */
+	int outlen = 0, tmplen = 0;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_128_cbc\n");
+		return -1;
+	}
+	if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_cbc\n");
+	if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+		printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+	if (!EVP_EncryptUpdate
+	    (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_cbc\n");
+	if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n");
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return tmplen;
+}
+
+/**
+ * Decrypt a buffer with OpenSSL AES-192-CBC, padding disabled.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param key        key handed to EVP_DecryptInit_ex()
+ * @param iv         IV handed to EVP_DecryptInit_ex()
+ * @param len        number of ciphertext bytes to process
+ * @param cyphertext input buffer
+ * @param plaintext  output buffer
+ * @return 0, or -1 when no cipher context could be allocated
+ */
+static inline
+int openssl_aes_192_cbc_dec(uint8_t * key, uint8_t * iv,
+			    int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+	int outlen = 0, tmplen = 0;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_192_cbc\n");
+		return -1;
+	}
+	if (!EVP_DecryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv))
+		printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_192_cbc\n");
+	if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+		printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+	if (!EVP_DecryptUpdate
+	    (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+		printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_192_cbc\n");
+	if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_192_cbc \n");
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return 0;
+}
+
+/**
+ * Encrypt a buffer with OpenSSL AES-192-CBC, padding disabled.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param key        key handed to EVP_EncryptInit_ex()
+ * @param iv         IV handed to EVP_EncryptInit_ex()
+ * @param len        number of plaintext bytes to process
+ * @param plaintext  input buffer
+ * @param cyphertext output buffer
+ * @return 0, or -1 when no cipher context could be allocated
+ */
+static inline
+int openssl_aes_192_cbc_enc(uint8_t * key, uint8_t * iv,
+			    int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+	/* Zero-initialised so a failed update never leaves an indeterminate
+	 * offset for the final call. */
+	int outlen = 0, tmplen = 0;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_192_cbc\n");
+		return -1;
+	}
+	if (!EVP_EncryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_192_cbc\n");
+	if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+		printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+	if (!EVP_EncryptUpdate
+	    (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_192_cbc\n");
+	if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_192_cbc\n");
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return 0;
+}
+
+/**
+ * Decrypt a buffer with OpenSSL AES-256-CBC, padding disabled.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param key        key handed to EVP_DecryptInit_ex()
+ * @param iv         IV handed to EVP_DecryptInit_ex()
+ * @param len        number of ciphertext bytes to process
+ * @param cyphertext input buffer
+ * @param plaintext  output buffer
+ * @return 0, or -1 when no cipher context could be allocated
+ */
+static inline
+int openssl_aes_256_cbc_dec(uint8_t * key, uint8_t * iv,
+			    int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+	int outlen = 0, tmplen = 0;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_256_cbc\n");
+		return -1;
+	}
+	if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
+		printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_cbc\n");
+	if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+		printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+	if (!EVP_DecryptUpdate
+	    (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+		printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_256_cbc\n");
+	if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_256_cbc %x,%x\n", outlen,
+		       tmplen);
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return 0;
+}
+
+/**
+ * Encrypt a buffer with OpenSSL AES-256-CBC, padding disabled.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param key        key handed to EVP_EncryptInit_ex()
+ * @param iv         IV handed to EVP_EncryptInit_ex()
+ * @param len        number of plaintext bytes to process
+ * @param plaintext  input buffer
+ * @param cyphertext output buffer
+ * @return 0, or -1 when no cipher context could be allocated
+ */
+static inline
+int openssl_aes_256_cbc_enc(uint8_t * key, uint8_t * iv,
+			    int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+	/* Zero-initialised so a failed update never leaves an indeterminate
+	 * offset for the final call. */
+	int outlen = 0, tmplen = 0;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_256_cbc\n");
+		return -1;
+	}
+	if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_256_cbc\n");
+	if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+		printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+	if (!EVP_EncryptUpdate
+	    (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_cbc\n");
+	if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_cbc\n");
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return 0;
+}
+
+/**
+ * Decrypt and authenticate a buffer with OpenSSL AES-128-GCM.
+ *
+ * A failing EVP call before the final step is reported with printf() and
+ * processing continues; only the result of EVP_DecryptFinal_ex() decides
+ * the return value.
+ *
+ * @param key        key handed to EVP_DecryptInit_ex()
+ * @param iv         IV handed to EVP_DecryptInit_ex()
+ * @param iv_len     IV length set through EVP_CTRL_GCM_SET_IVLEN
+ * @param aad        additional authenticated data
+ * @param aad_len    number of AAD bytes
+ * @param tag        expected authentication tag
+ * @param tag_len    number of tag bytes
+ * @param cyphertext input buffer
+ * @param len        number of ciphertext bytes to process
+ * @param plaintext  output buffer
+ * @return the length from EVP_DecryptFinal_ex() plus the length from the
+ *         ciphertext EVP_DecryptUpdate() when the final call returns a
+ *         positive value; -1 otherwise, or when no cipher context could
+ *         be allocated
+ */
+static inline
+int openssl_aes_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+			int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext,
+			int len, uint8_t * plaintext)
+{
+	int outlen = 0, tmplen = len, ret;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *const ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_128_gcm\n");
+		return -1;
+	}
+	if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_gcm\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+	if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv))
+		printf("\n ERROR!! EVP_DecryptInit_ex - key init\n");
+	/* A NULL output pointer feeds the data in as AAD only. */
+	if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+		printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n");
+	if (!EVP_DecryptUpdate
+	    (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+		printf("\n ERROR!! EVP_DecryptUpdate - CT->PT\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+
+	ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen);
+	if (0 < ret) {
+		tmplen += outlen;
+	} else {
+		// Authentication failed: mismatched key, AAD or tag
+		tmplen = -1;
+	}
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return tmplen;
+}
+
+/**
+ * Encrypt a buffer with OpenSSL AES-128-GCM and fetch the tag.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param key        key handed to EVP_EncryptInit_ex()
+ * @param iv         IV handed to EVP_EncryptInit_ex()
+ * @param iv_len     IV length set through EVP_CTRL_GCM_SET_IVLEN
+ * @param aad        additional authenticated data
+ * @param aad_len    number of AAD bytes
+ * @param tag        receives the authentication tag
+ * @param tag_len    number of tag bytes requested
+ * @param plaintext  input buffer
+ * @param len        number of plaintext bytes to process
+ * @param cyphertext output buffer
+ * @return the byte count stored by EVP_EncryptFinal_ex(), or -1 when no
+ *         cipher context could be allocated
+ */
+static inline
+int openssl_aes_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+			int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext,
+			int len, uint8_t * cyphertext)
+{
+	/* Zero-initialised so a failed step never feeds an indeterminate
+	 * offset or return value to the code that follows. */
+	int outlen = 0, tmplen = 0;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *const ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_128_gcm\n");
+		return -1;
+	}
+	if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_gcm\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+	if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv))
+		printf("\n ERROR!! EVP_EncryptInit_ex - init\n");
+	/* A NULL output pointer feeds the data in as AAD only. */
+	if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+		printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n");
+	if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_gcm\n");
+	if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_gcm\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n");
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return tmplen;
+}
+
+/**
+ * Decrypt and authenticate a buffer with OpenSSL AES-256-GCM.
+ *
+ * A failing EVP call before the final step is reported with printf() and
+ * processing continues; only the result of EVP_DecryptFinal_ex() decides
+ * the return value.
+ *
+ * @param key        key handed to EVP_DecryptInit_ex()
+ * @param iv         IV handed to EVP_DecryptInit_ex()
+ * @param iv_len     IV length set through EVP_CTRL_GCM_SET_IVLEN
+ * @param aad        additional authenticated data
+ * @param aad_len    number of AAD bytes
+ * @param tag        expected authentication tag
+ * @param tag_len    number of tag bytes
+ * @param cyphertext input buffer
+ * @param len        number of ciphertext bytes to process
+ * @param plaintext  output buffer
+ * @return the length from EVP_DecryptFinal_ex() plus the length from the
+ *         ciphertext EVP_DecryptUpdate() when the final call returns a
+ *         positive value; -1 otherwise, or when no cipher context could
+ *         be allocated
+ */
+static inline
+int openssl_aes_256_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+			    int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext,
+			    int len, uint8_t * plaintext)
+{
+	int outlen = 0, tmplen = len, ret;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *const ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_256_gcm\n");
+		return -1;
+	}
+	if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_gcm\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+	if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv))
+		printf("\n ERROR!! EVP_DecryptInit_ex - key init\n");
+	/* A NULL output pointer feeds the data in as AAD only. */
+	if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+		printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n");
+	if (!EVP_DecryptUpdate
+	    (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+		printf("\n ERROR!! EVP_DecryptUpdate - CT->PT\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+
+	ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen);
+	if (0 < ret) {
+		tmplen += outlen;
+	} else {
+		// Authentication failed: mismatched key, AAD or tag
+		tmplen = -1;
+	}
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return tmplen;
+}
+
+/**
+ * Encrypt a buffer with OpenSSL AES-256-GCM and fetch the tag.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param key        key handed to EVP_EncryptInit_ex()
+ * @param iv         IV handed to EVP_EncryptInit_ex()
+ * @param iv_len     IV length set through EVP_CTRL_GCM_SET_IVLEN
+ * @param aad        additional authenticated data
+ * @param aad_len    number of AAD bytes
+ * @param tag        receives the authentication tag
+ * @param tag_len    number of tag bytes requested
+ * @param plaintext  input buffer
+ * @param len        number of plaintext bytes to process
+ * @param cyphertext output buffer
+ * @return the byte count stored by EVP_EncryptFinal_ex(), or -1 when no
+ *         cipher context could be allocated
+ */
+static inline
+int openssl_aes_256_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+			    int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext,
+			    int len, uint8_t * cyphertext)
+{
+	/* Zero-initialised so a failed step never feeds an indeterminate
+	 * offset or return value to the code that follows. */
+	int outlen = 0, tmplen = 0;
+	/* Heap-allocated: EVP_CIPHER_CTX is an opaque type in OpenSSL 1.1.0
+	 * and later, so it cannot live on the stack. */
+	EVP_CIPHER_CTX *const ctx = EVP_CIPHER_CTX_new();
+
+	if (ctx == NULL) {
+		printf("\n ERROR!! EVP_CIPHER_CTX_new - EVP_aes_256_gcm\n");
+		return -1;
+	}
+	if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_256_gcm\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+	if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv))
+		printf("\n ERROR!! EVP_EncryptInit_ex - init\n");
+	/* A NULL output pointer feeds the data in as AAD only. */
+	if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+		printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n");
+	if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_gcm\n");
+	if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_gcm\n");
+	if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag))
+		printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n");
+
+	/* Release the context and the cipher state attached to it. */
+	EVP_CIPHER_CTX_free(ctx);
+	return tmplen;
+}
+
+#endif /* AES_OSSL_HELPER_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c
new file mode 100644
index 00000000..bfde3557
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c
@@ -0,0 +1,141 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+// Benchmark sizing. Define CACHED_TEST (uncomment the next line) to loop
+// many times over a small buffer instead of a few times over a large one.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN (8*1024)
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+// Macro bodies are parenthesised so they expand safely inside any expression.
+# define GT_L3_CACHE (32*1024*1024)	/* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+/*
+ * Fill the first 16 bytes of k1, k2 and k3 and the first n bytes of p with
+ * values drawn from rand().
+ */
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+		  int n)
+{
+	int pos;
+
+	/* One draw per buffer per byte position, in k1, k2, k3 order. */
+	for (pos = 0; pos < 16; pos++) {
+		k1[pos] = rand();
+		k2[pos] = rand();
+		k3[pos] = rand();
+	}
+
+	pos = 0;
+	while (pos < n) {
+		p[pos] = rand();
+		pos++;
+	}
+}
+
+/**
+ * Decrypt TEST_LEN bytes with OpenSSL AES-128-XTS using a caller-owned
+ * cipher context.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param ctx cipher context, (re)initialised by this call
+ * @param key key handed to EVP_DecryptInit_ex()
+ * @param iv  IV handed to EVP_DecryptInit_ex()
+ * @param ct  input buffer of TEST_LEN bytes
+ * @param dt  output buffer
+ * @return always 0
+ */
+static inline
+int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+			    unsigned char *ct, unsigned char *dt)
+{
+	/* Zero-initialised so a failed update never leaves an indeterminate
+	 * offset for the final call. */
+	int outlen = 0, tmplen = 0;
+
+	if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+		printf("\n ERROR!! \n");
+	if (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, TEST_LEN))
+		printf("\n ERROR!! \n");
+	if (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))
+		printf("\n ERROR!! \n");
+
+	return 0;
+}
+
+/**
+ * Check ISA-L XTS-AES-128 decryption against OpenSSL on random data, then
+ * time TEST_LOOPS decryptions of TEST_LEN bytes with each implementation.
+ *
+ * @return 0 on success, -1 on allocation failure or output mismatch
+ */
+int main(void)
+{
+	int i, ret = -1;
+
+	unsigned char key1[16], key2[16], tinit[16];
+	unsigned char *pt = NULL, *ct = NULL, *dt = NULL, *refdt = NULL;
+	unsigned char keyssl[32];	/* SSL takes both keys together */
+	struct perf start, stop;
+
+	/* One cipher context is reused for every OpenSSL call. It is
+	 * heap-allocated because EVP_CIPHER_CTX is an opaque type in
+	 * OpenSSL 1.1.0 and later. */
+	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+
+	printf("aes_xts_128_dec_perf:\n");
+
+	if (NULL == ctx) {
+		printf("EVP_CIPHER_CTX_new failed\n");
+		goto out;
+	}
+
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	dt = malloc(TEST_LEN);
+	refdt = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) {
+		printf("malloc of testsize failed\n");
+		goto out;
+	}
+
+	mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+	/* Set up key for the SSL engine: key1 followed by key2 */
+	for (i = 0; i < 16; i++) {
+		keyssl[i] = key1[i];
+		keyssl[i + 16] = key2[i];
+	}
+
+	/* Encrypt and compare decrypted output */
+	XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+	openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt);
+	if (memcmp(dt, refdt, TEST_LEN)) {
+		printf("ISA-L and OpenSSL results don't match\n");
+		goto out;
+	}
+
+	/* Time ISA-L decryption */
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++)
+		XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+	perf_stop(&stop);
+	printf("aes_xts_128_dec" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	/* Time OpenSSL decryption */
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++)
+		openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt);
+	perf_stop(&stop);
+	printf("aes_xts_128_openssl_dec" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	ret = 0;
+
+      out:
+	/* free() and EVP_CIPHER_CTX_free() both accept NULL, so one exit
+	 * path serves every failure point. */
+	free(pt);
+	free(ct);
+	free(dt);
+	free(refdt);
+	EVP_CIPHER_CTX_free(ctx);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c
new file mode 100644
index 00000000..5f41f064
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c
@@ -0,0 +1,102 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+// Benchmark sizing. Define CACHED_TEST (uncomment the next line) to loop
+// many times over a small buffer instead of a few times over a large one.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN (8*1024)
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+// Macro bodies are parenthesised so they expand safely inside any expression.
+# define GT_L3_CACHE (32*1024*1024)	/* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+/*
+ * Fill the first 16 bytes of k1, k2 and k3 and the first n bytes of p with
+ * values drawn from rand().
+ */
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+		  int n)
+{
+	int pos;
+
+	/* One draw per buffer per byte position, in k1, k2, k3 order. */
+	for (pos = 0; pos < 16; pos++) {
+		k1[pos] = rand();
+		k2[pos] = rand();
+		k3[pos] = rand();
+	}
+
+	pos = 0;
+	while (pos < n) {
+		p[pos] = rand();
+		pos++;
+	}
+}
+
+/**
+ * Time TEST_LOOPS ISA-L XTS-AES-128 decryptions of a TEST_LEN byte buffer
+ * of random data and print the throughput.
+ *
+ * @return 0 on success, -1 if a buffer could not be allocated
+ */
+int main(void)
+{
+	int i, ret = -1;
+
+	unsigned char key1[16], key2[16], tinit[16];
+	unsigned char *pt, *ct, *dt;
+	struct perf start, stop;
+
+	printf("aes_xts_128_dec_perf:\n");
+
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	dt = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == dt) {
+		printf("malloc of testsize failed\n");
+		goto out;
+	}
+
+	mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+	/* Produce the ciphertext to decrypt, then run one untimed decrypt
+	 * before the measured loop. */
+	XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+	perf_start(&start);
+
+	for (i = 0; i < TEST_LOOPS; i++) {
+		XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+	}
+
+	perf_stop(&stop);
+
+	printf("aes_xts_128_dec" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	ret = 0;
+
+      out:
+	/* free(NULL) is a no-op, so a partial allocation is released too. */
+	free(pt);
+	free(ct);
+	free(dt);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c
new file mode 100644
index 00000000..7c6a445e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c
@@ -0,0 +1,142 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+// Benchmark sizing. Define CACHED_TEST (uncomment the next line) to loop
+// many times over a small buffer instead of a few times over a large one.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN (8*1024)
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+// Macro bodies are parenthesised so they expand safely inside any expression.
+# define GT_L3_CACHE (32*1024*1024)	/* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+/*
+ * Fill the first 16 bytes of k1, k2 and k3 and the first n bytes of p with
+ * values drawn from rand().
+ */
+void xts128_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3,
+			 unsigned char *p, int n)
+{
+	int pos;
+
+	/* One draw per buffer per byte position, in k1, k2, k3 order. */
+	for (pos = 0; pos < 16; pos++) {
+		k1[pos] = rand();
+		k2[pos] = rand();
+		k3[pos] = rand();
+	}
+
+	pos = 0;
+	while (pos < n) {
+		p[pos] = rand();
+		pos++;
+	}
+}
+
+/**
+ * Encrypt a buffer with OpenSSL AES-128-XTS using a caller-owned cipher
+ * context.
+ *
+ * A failing EVP call is reported with printf() and processing continues
+ * with the next step.
+ *
+ * @param ctx cipher context, (re)initialised by this call
+ * @param key key handed to EVP_EncryptInit_ex()
+ * @param iv  IV handed to EVP_EncryptInit_ex()
+ * @param len number of plaintext bytes to process
+ * @param pt  input buffer
+ * @param ct  output buffer
+ * @return always 0
+ */
+static inline
+int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+			    int len, unsigned char *pt, unsigned char *ct)
+{
+	/* Zero-initialised so a failed update never leaves an indeterminate
+	 * offset for the final call. */
+	int outlen = 0, tmplen = 0;
+
+	if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+		printf("\n ERROR!! \n");
+	if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+		printf("\n ERROR!! \n");
+	if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))
+		printf("\n ERROR!! \n");
+
+	return 0;
+}
+
+/**
+ * Check ISA-L XTS-AES-128 encryption against OpenSSL on random data, then
+ * time TEST_LOOPS encryptions of TEST_LEN bytes with each implementation.
+ *
+ * @return 0 on success, -1 on allocation failure or output mismatch
+ */
+int main(void)
+{
+	int i, ret = -1;
+
+	unsigned char key1[16], key2[16], tinit[16];
+	unsigned char *pt = NULL, *ct = NULL, *refct = NULL;
+	struct perf start, stop;
+	unsigned char keyssl[32];	/* SSL takes both keys together */
+
+	/* One cipher context is reused for every OpenSSL call. It is
+	 * heap-allocated because EVP_CIPHER_CTX is an opaque type in
+	 * OpenSSL 1.1.0 and later. */
+	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+
+	printf("aes_xts_128_enc_perf:\n");
+
+	if (NULL == ctx) {
+		printf("EVP_CIPHER_CTX_new failed\n");
+		goto out;
+	}
+
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	refct = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == refct) {
+		printf("malloc of testsize failed\n");
+		goto out;
+	}
+
+	xts128_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+	/* Set up key for the SSL engine: key1 followed by key2 */
+	for (i = 0; i < 16; i++) {
+		keyssl[i] = key1[i];
+		keyssl[i + 16] = key2[i];
+	}
+
+	/* Encrypt and compare output */
+	XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+	if (memcmp(ct, refct, TEST_LEN)) {
+		printf("ISA-L and OpenSSL results don't match\n");
+		goto out;
+	}
+
+	/* Time ISA-L encryption */
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++)
+		XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	perf_stop(&stop);
+
+	printf("aes_xts_128_enc" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	/* Time OpenSSL encryption */
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++)
+		openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+	perf_stop(&stop);
+
+	printf("aes_xts_128_openssl_enc" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	ret = 0;
+
+      out:
+	/* free() and EVP_CIPHER_CTX_free() both accept NULL, so one exit
+	 * path serves every failure point. */
+	free(pt);
+	free(ct);
+	free(refct);
+	EVP_CIPHER_CTX_free(ctx);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c
new file mode 100644
index 00000000..1fce1665
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c
@@ -0,0 +1,100 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+// Benchmark sizing. Define CACHED_TEST (uncomment the next line) to loop
+// many times over a small buffer instead of a few times over a large one.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN (8*1024)
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+// Macro bodies are parenthesised so they expand safely inside any expression.
+# define GT_L3_CACHE (32*1024*1024)	/* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+/*
+ * Fill the first 16 bytes of k1, k2 and k3 and the first n bytes of p with
+ * values drawn from rand().
+ */
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+		  int n)
+{
+	int pos;
+
+	/* One draw per buffer per byte position, in k1, k2, k3 order. */
+	for (pos = 0; pos < 16; pos++) {
+		k1[pos] = rand();
+		k2[pos] = rand();
+		k3[pos] = rand();
+	}
+
+	pos = 0;
+	while (pos < n) {
+		p[pos] = rand();
+		pos++;
+	}
+}
+
+/**
+ * Time TEST_LOOPS ISA-L XTS-AES-128 encryptions of a TEST_LEN byte buffer
+ * of random data and print the throughput.
+ *
+ * @return 0 on success, -1 if a buffer could not be allocated
+ */
+int main(void)
+{
+	int i, ret = -1;
+
+	unsigned char key1[16], key2[16], tinit[16];
+	unsigned char *pt, *ct;
+	struct perf start, stop;
+
+	printf("aes_xts_128_enc_perf:\n");
+
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct) {
+		printf("malloc of testsize failed\n");
+		goto out;
+	}
+
+	mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+	/* One untimed encrypt before the measured loop. */
+	XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+
+	perf_start(&start);
+
+	for (i = 0; i < TEST_LOOPS; i++) {
+		XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	}
+
+	perf_stop(&stop);
+
+	printf("aes_xts_128_enc" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	ret = 0;
+
+      out:
+	/* free(NULL) is a no-op, so a partial allocation is released too. */
+	free(pt);
+	free(ct);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c
new file mode 100644
index 00000000..3b294ef7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c
@@ -0,0 +1,116 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <aes_keyexp.h>
+#include "xts_128_vect.h"
+
+/**
+ * Run every entry of vlist through the expanded-key XTS-AES-128 encrypt
+ * and decrypt routines and compare the output byte-for-byte with the
+ * reference ciphertext / plaintext stored in the vector.
+ *
+ * Prints one '.' per passing vector and "Pass" at the end.
+ *
+ * @return 0 when every vector matches, -1 on the first mismatch or on an
+ *         allocation failure
+ */
+int main(void)
+{
+
+	// Temporary array for the calculated vectors
+	uint8_t *ct_test;
+	uint8_t *pt_test;
+	// Arrays for expanded keys, null_key is a dummy vector (decrypt key not
+	// needed for the tweak part of the decryption)
+	uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11];
+	uint8_t expkey1_dec[16 * 11], null_key[16 * 11];
+
+	int i, j;
+
+	// --- Encryption test ---
+
+	// Loop over the vectors
+	for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated ciphertext
+		ct_test = malloc(vlist[i].ptlen);
+		if (ct_test == NULL) {
+			printf("Can't allocate ciphertext memory\n");
+			return -1;
+		}
+		// Pre-expand keys (will only use the encryption ones here)
+		aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec);
+		aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key);
+
+		XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW,
+					     vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+		// Carry out comparison of the calculated ciphertext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (ct_test[j] != vlist[i].CTX[j]) {
+				// Indices 0-8 are reported as vectors 1-9,
+				// indices 9 and up as vectors 15 and up
+				printf("\nXTS_AES_128_enc: Vector %d: ",
+				       i < 9 ? i + 1 : i + 6);
+				printf("failed at byte %d! \n", j);
+				free(ct_test);
+				return -1;
+			}
+		}
+		// The buffer is per-vector; release it before the next one
+		free(ct_test);
+		printf(".");
+	}
+
+	// --- Decryption test ---
+
+	// Loop over the vectors
+	for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+		pt_test = malloc(vlist[i].ptlen);
+		if (pt_test == NULL) {
+			printf("Can't allocate plaintext memory\n");
+			return -1;
+		}
+		// Pre-expand keys for the decryption
+		aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec);
+		aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key);
+
+		// Note, encryption key is re-used for the tweak decryption step
+		XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW,
+					     vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (pt_test[j] != vlist[i].PTX[j]) {
+				printf("\nXTS_AES_128_dec: Vector %d: ",
+				       i < 9 ? i + 1 : i + 6);
+				printf(" failed at byte %d! \n", j);
+				free(pt_test);
+				return -1;
+			}
+		}
+		// The buffer is per-vector; release it before the next one
+		free(pt_test);
+		printf(".");
+	}
+	printf("Pass\n");
+
+	return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c
new file mode 100644
index 00000000..34498082
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c
@@ -0,0 +1,247 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_xts.h>
+#include <aes_keyexp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_SIZE (4096)
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+
+/*
+ * Fill the test inputs with bytes drawn from rand().
+ *
+ * k1, k2, k3: buffers of at least 16 bytes each; exactly 16 bytes of each
+ *             are written.  rand() is called once per byte in the order
+ *             k1[0], k2[0], k3[0], k1[1], ...
+ * p:          buffer of at least n bytes, filled after the three 16-byte
+ *             buffers.
+ * n:          number of bytes to write to p; nothing is written to p when
+ *             n <= 0.
+ */
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+		  int n)
+{
+	int pos = 0;
+
+	while (pos < 16) {
+		k1[pos] = rand();
+		k2[pos] = rand();
+		k3[pos] = rand();
+		pos++;
+	}
+
+	for (pos = 0; pos < n; pos++)
+		p[pos] = rand();
+}
+
+/*
+ * Round-trip self test for the AES-128 XTS routines.
+ *
+ * Each stage encrypts random data and decrypts it again, then checks that
+ * the result equals the original plaintext:
+ *   1. one run over the full TEST_LEN buffer,
+ *   2. RANDOMS runs with a random length,
+ *   3. runs that end exactly at the end of the buffers (for Electric Fence),
+ *      for XTS_AES_128_enc/dec and for the *_expanded_key variants,
+ *   4. runs with 0..15 bytes, where the buffers must be left unchanged.
+ *
+ * Returns 0 when every check passes and -1 on the first failure.  All heap
+ * buffers are released on both paths.
+ */
+int main(void)
+{
+	int t, n;
+	int ret = -1;		// only set to 0 after every check has passed
+
+	unsigned char key1[16], key2[16], tinit[16];
+	unsigned char *pt = NULL, *ct = NULL, *dt = NULL;
+
+	int align, size, min_size;
+	unsigned char *efence_pt;
+	unsigned char *efence_ct;
+	unsigned char *efence_dt;
+
+	// Snapshots taken before a short (0..15 byte) request, used to show
+	// that such a request leaves the buffers untouched
+	unsigned char *origin_pt = NULL;
+	unsigned char *origin_ct = NULL;
+	unsigned char *origin_dt = NULL;
+
+	unsigned char key1_exp_enc[16 * 11], key1_exp_dec[16 * 11];
+	unsigned char key2_exp_tw[16 * 11];
+	int i;
+
+	printf("aes_xts_128 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN);
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	dt = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == dt) {
+		printf("malloc of testsize failed\n");
+		goto cleanup;
+	}
+
+	mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+	XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+	if (memcmp(pt, dt, TEST_LEN)) {
+		printf("fail\n");
+		goto cleanup;
+	}
+	putchar('.');
+
+	// Do tests with random data, keys and message size
+	for (t = 0; t < RANDOMS; t++) {
+		n = rand() % (TEST_LEN);
+		if (n < 17)
+			continue;
+
+		mk_rand_data(key1, key2, tinit, pt, n);
+		XTS_AES_128_enc(key2, key1, tinit, n, pt, ct);
+		XTS_AES_128_dec(key2, key1, tinit, n, ct, dt);
+
+		if (memcmp(pt, dt, n)) {
+			printf("fail rand %d, size %d\n", t, n);
+			goto cleanup;
+		}
+		putchar('.');
+		fflush(0);
+	}
+
+	// Run tests at end of buffer for Electric Fence
+	align = 1;
+	min_size = 16;
+	for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+		// Line up TEST_SIZE from end
+		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+		efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+		mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+		XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+		XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+		if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+			printf("efence: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		putchar('.');
+		fflush(0);
+	}
+
+	origin_pt = malloc(TEST_LEN);
+	origin_ct = malloc(TEST_LEN);
+	origin_dt = malloc(TEST_LEN);
+	if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) {
+		printf("malloc of testsize failed\n");
+		goto cleanup;
+	}
+	// For data lengths from 0 to 15 bytes, the functions return without any error
+	// codes, without reading or writing any data.
+	for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+		// Line up TEST_SIZE from end
+		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+		efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+		// Give all three buffers the same known content and keep a copy,
+		// so any write by the routines under test shows up in the memcmp
+		mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+		memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+		memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+		memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+		memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+		memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+		XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+		XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+		if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+			printf("efence_pt: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+			printf("efence_ct: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+			printf("efence_dt: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		putchar('.');
+		fflush(0);
+	}
+
+	// The tweak key schedule is filled with random bytes rather than being
+	// expanded from key2; the same bytes are handed to both the encrypt and
+	// the decrypt call below, so the round trip is still self-consistent.
+	for (i = 0; i < 16 * 11; i++) {
+		key2_exp_tw[i] = rand();
+	}
+
+	for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+		// Line up TEST_SIZE from end
+		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+		efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+		mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+		aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);
+
+		XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+					     TEST_SIZE - size, efence_pt, efence_ct);
+		XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+					     TEST_SIZE - size, efence_ct, efence_dt);
+
+		if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+			printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		putchar('.');
+		fflush(0);
+	}
+
+	// For data lengths from 0 to 15 bytes, the functions return without any error
+	// codes, without reading or writing any data.
+	for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+		// Line up TEST_SIZE from end
+		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+		efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+		mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+		memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+		memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+		memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+		memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+		memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+		aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);
+
+		XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+					     TEST_SIZE - size, efence_pt, efence_ct);
+		XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+					     TEST_SIZE - size, efence_ct, efence_dt);
+
+		if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+			printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+			printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+			printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		putchar('.');
+		fflush(0);
+	}
+
+	printf("Pass\n");
+	ret = 0;
+
+cleanup:
+	// free(NULL) is a no-op, so this is safe however far the test got
+	free(origin_dt);
+	free(origin_ct);
+	free(origin_pt);
+	free(dt);
+	free(ct);
+	free(pt);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c
new file mode 100644
index 00000000..aaed9347
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c
@@ -0,0 +1,207 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_LOOPS 100
+#ifndef RANDOMS
+# define RANDOMS 100
+#endif
+
+/*
+ * Generates random data for keys, tweak and plaintext.
+ *
+ * Writes 16 rand() bytes to each of k1, k2 and k3 (one rand() call per byte,
+ * interleaved k1/k2/k3), then n rand() bytes to p.  Nothing is written to p
+ * when n <= 0.
+ */
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+		  int n)
+{
+	int off;
+	int left;
+
+	for (off = 0; off != 16; ++off) {
+		k1[off] = rand();
+		k2[off] = rand();
+		k3[off] = rand();
+	}
+
+	for (left = n; left > 0; left--) {
+		*p = rand();
+		p++;
+	}
+}
+
+/*
+ * Wrapper for OpenSSL EVP AES-XTS 128 encryption.
+ *
+ * ctx: cipher context to (re)initialise for this operation
+ * key: key material handed to EVP_EncryptInit_ex() for EVP_aes_128_xts()
+ * iv:  16-byte tweak, passed as the IV
+ * len: number of bytes to encrypt
+ * pt:  input plaintext, len bytes
+ * ct:  output buffer for the ciphertext
+ *
+ * Returns 0 on success and -1 as soon as one EVP call reports failure.
+ * Stopping at the first failure matters: after a failed EVP_EncryptUpdate()
+ * the output length is not meaningful and must not be used to compute the
+ * pointer handed to EVP_EncryptFinal_ex().
+ */
+static inline
+int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+			    int len, unsigned char *pt, unsigned char *ct)
+{
+	int outlen = 0, tmplen = 0;
+
+	// The || chain short-circuits, so a later step never runs after a failed one
+	if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)
+	    || !EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)
+	    || !EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen)) {
+		printf("\n ERROR!! \n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Wrapper for OpenSSL EVP AES-XTS 128 decryption.
+ *
+ * ctx: cipher context to (re)initialise for this operation
+ * key: key material handed to EVP_DecryptInit_ex() for EVP_aes_128_xts()
+ * iv:  16-byte tweak, passed as the IV
+ * len: number of bytes to decrypt
+ * ct:  input ciphertext, len bytes
+ * dt:  output buffer for the decrypted text
+ *
+ * Returns 0 on success and -1 as soon as one EVP call reports failure.
+ * Stopping at the first failure matters: after a failed EVP_DecryptUpdate()
+ * the output length is not meaningful and must not be used to compute the
+ * pointer handed to EVP_DecryptFinal_ex().
+ */
+static inline
+int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+			    int len, unsigned char *ct, unsigned char *dt)
+{
+	int outlen = 0, tmplen = 0;
+
+	// The || chain short-circuits, so a later step never runs after a failed one
+	if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)
+	    || !EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len)
+	    || !EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen)) {
+		printf("\n ERROR!! \n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Cross-checks XTS_AES_128_enc/XTS_AES_128_dec against OpenSSL's
+ * EVP_aes_128_xts() on random keys, tweaks and data: first TEST_LOOPS runs
+ * over TEST_LEN bytes, then RANDOMS runs over random lengths.
+ *
+ * Returns 0 when all outputs match and -1 on the first mismatch or setup
+ * failure.  The cipher context and all buffers are released on both paths.
+ */
+int main(void)
+{
+	int ret = -1;		/* only set to 0 after every check has passed */
+
+	unsigned char key1[16], key2[16], tinit[16];
+	unsigned char *pt = NULL, *ct = NULL, *dt = NULL, *refct = NULL, *refdt = NULL;
+	unsigned char keyssl[32];	/* SSL takes both keys together */
+	unsigned int rand_len, t;
+	int i, j, k;
+
+	/*
+	 * Initialise our cipher context, which can use same input vectors.
+	 * It is heap-allocated through the EVP API because EVP_CIPHER_CTX is
+	 * an opaque type in OpenSSL 1.1.0 and later and cannot be declared
+	 * on the stack there.
+	 */
+	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+
+	if (NULL == ctx) {
+		printf("EVP_CIPHER_CTX_new failed\n");
+		goto cleanup;
+	}
+
+	/* Allocate space for input and output buffers */
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	dt = malloc(TEST_LEN);
+	refct = malloc(TEST_LEN);
+	refdt = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+		printf("malloc of testsize failed\n");
+		goto cleanup;
+	}
+
+	/**************************** FIXED LENGTH TEST *************************/
+	printf("aes_xts_128_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN);
+
+	// Loop over the vectors
+	for (i = 0; i < TEST_LOOPS; i++) {
+
+		mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+		/* Set up key for the SSL engine */
+		for (k = 0; k < 16; k++) {
+			keyssl[k] = key1[k];
+			keyssl[k + 16] = key2[k];
+		}
+
+		/* Encrypt using each method */
+		XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+		if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct) != 0) {
+			printf("OpenSSL reference encryption failed\n");
+			goto cleanup;
+		}
+
+		/* Carry out comparison of the calculated ciphertext with
+		 * the reference
+		 */
+		for (j = 0; j < TEST_LEN; j++) {
+
+			if (ct[j] != refct[j]) {
+				printf("XTS_AES_128_enc failed at byte %d! \n", j);
+				goto cleanup;
+			}
+		}
+
+		/* Decrypt using each method */
+		XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+		if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt) != 0) {
+			printf("OpenSSL reference decryption failed\n");
+			goto cleanup;
+		}
+
+		for (j = 0; j < TEST_LEN; j++) {
+
+			if (dt[j] != refdt[j]) {
+				printf("XTS_AES_128_dec failed at byte %d! \n", j);
+				goto cleanup;
+			}
+		}
+		printf(".");
+		fflush(0);
+	}
+	printf("Pass\n");
+
+	/**************************** RANDOM LENGTH TEST *************************/
+	printf("aes_xts_128_rand_ossl test, %d sets of random lengths: ", RANDOMS);
+
+	/* Run tests with random size */
+
+	for (t = 0; t < RANDOMS; t++) {
+
+		rand_len = rand() % (TEST_LEN);
+
+		/*
+		 * XTS needs at least one full 16-byte block.  Shorter requests
+		 * would not produce fresh output to compare, so skip them
+		 * instead of comparing whatever the previous run left in the
+		 * buffers.
+		 */
+		if (rand_len < 16)
+			continue;
+
+		mk_rand_data(key1, key2, tinit, pt, rand_len);
+
+		/* Set up key for the SSL engine */
+		for (k = 0; k < 16; k++) {
+			keyssl[k] = key1[k];
+			keyssl[k + 16] = key2[k];
+		}
+
+		/* Encrypt using each method */
+		XTS_AES_128_enc(key2, key1, tinit, rand_len, pt, ct);
+		if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct) != 0) {
+			printf("OpenSSL reference encryption failed\n");
+			goto cleanup;
+		}
+
+		/* Carry out comparison of the calculated ciphertext with
+		 * the reference
+		 */
+		for (j = 0; j < rand_len; j++) {
+
+			if (ct[j] != refct[j]) {
+				printf("XTS_AES_128_enc failed at byte %d! \n", j);
+				goto cleanup;
+			}
+		}
+
+		/* Decrypt using each method */
+		XTS_AES_128_dec(key2, key1, tinit, rand_len, ct, dt);
+		if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt) != 0) {
+			printf("OpenSSL reference decryption failed\n");
+			goto cleanup;
+		}
+
+		for (j = 0; j < rand_len; j++) {
+
+			if (dt[j] != refdt[j]) {
+				printf("XTS_AES_128_dec failed at byte %d! \n", j);
+				goto cleanup;
+			}
+		}
+		printf(".");
+		fflush(0);
+	}
+	printf("Pass\n");
+
+	printf("aes_xts_128_rand_ossl: All tests passed\n");
+	ret = 0;
+
+cleanup:
+	if (ctx != NULL)
+		EVP_CIPHER_CTX_free(ctx);
+	/* free(NULL) is a no-op, so this is safe however far the test got */
+	free(refdt);
+	free(refct);
+	free(dt);
+	free(ct);
+	free(pt);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c
new file mode 100644
index 00000000..4092d2dd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c
@@ -0,0 +1,106 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "xts_128_vect.h"
+
+/*
+ * Known-answer test for XTS_AES_128_enc and XTS_AES_128_dec.
+ *
+ * For each of the NVEC entries of vlist the plaintext is encrypted and
+ * compared byte by byte with the reference ciphertext; then the reference
+ * ciphertext is decrypted and compared with the plaintext.
+ *
+ * Returns 0 when all vectors match and -1 on the first mismatch or failed
+ * allocation.  The scratch buffer of each vector is released before moving
+ * on, including on the failure paths.
+ */
+int main(void)
+{
+
+	// Temporary array for the calculated vectors
+	uint8_t *ct_test;
+	uint8_t *pt_test;
+
+	int i, j;
+
+	// --- Encryption test ---
+
+	// Loop over the vectors
+	for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated ciphertext
+		ct_test = malloc(vlist[i].ptlen);
+		if (ct_test == NULL) {
+			fprintf(stderr, "Can't allocate ciphertext memory\n");
+			return -1;
+		}
+
+		XTS_AES_128_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+				vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+		// Carry out comparison of the calculated ciphertext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (ct_test[j] != vlist[i].CTX[j]) {
+				// Report the vector number rather than the index:
+				// index 0-8 is printed as 1-9, index 9-13 as 15-19
+				printf("\nXTS_AES_128_enc: Vector %d: ",
+				       i < 9 ? i + 1 : i + 6);
+
+				printf("failed at byte %d! \n", j);
+				free(ct_test);
+				return -1;
+			}
+		}
+		free(ct_test);
+		printf(".");
+	}
+
+	// --- Decryption test ---
+
+	// Loop over the vectors
+	for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+		pt_test = malloc(vlist[i].ptlen);
+		if (pt_test == NULL) {
+			fprintf(stderr, "Can't allocate plaintext memory\n");
+			return -1;
+		}
+
+		XTS_AES_128_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+				vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (pt_test[j] != vlist[i].PTX[j]) {
+				// Same index-to-vector-number mapping as above
+				printf("\nXTS_AES_128_dec: Vector %d: ",
+				       i < 9 ? i + 1 : i + 6);
+
+				printf(" failed at byte %d! \n", j);
+				free(pt_test);
+				return -1;
+			}
+		}
+		free(pt_test);
+		printf(".");
+	}
+	printf("Pass\n");
+
+	return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
new file mode 100644
index 00000000..55a53bc2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
@@ -0,0 +1,1691 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+
+#define NVEC 14
+
+// One XTS-AES-128 test case: pointers to the key, tweak, plaintext and
+// reference ciphertext byte arrays, plus the length of the data.
+struct xts_vector {
+ uint64_t ptlen; // length in bytes of the plaintext (CTX has the same length)
+ uint8_t *key1; // dimension 16 for 128 bit aes; the tests pass it as the second key argument of XTS_AES_128_enc/dec
+ uint8_t *key2; // dimension 16 for 128 bit aes; the tests pass it as the first key argument of XTS_AES_128_enc/dec
+ uint8_t *TW; // tweak, dimension 16 for both 128 and 256 bit
+ uint8_t *PTX; // plaintext, min. dimension 16
+ uint8_t *CTX; // reference ciphertext, same dimension as PTX
+};
+
+/* Define our test vectors statically here. Test vectors are from the standard:
+ * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented
+ * Storage Devices"
+ * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450
+ *
+ * Vector 1
+ * Key1 00000000000000000000000000000000
+ * Key2 00000000000000000000000000000000
+ * Data Unit Sequence number 0
+ * PTX 0000000000000000000000000000000000000000000000000000000000000000 /128bit
+ * TWK 66e94bd4ef8a2c3b884cfa59ca342b2eccd297a8df1559761099f4b39469565c
+ * CTX 917cf69ebd68b2ec9b9fe9a3eadda692cd43d2f59598ed858c02c2652fbf922e
+ * Plaintext length (bytes): 32
+ */
+
+// Vector 1, Key1: 16 zero bytes
+static uint8_t v1_key1[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+// Vector 1, Key2: 16 zero bytes
+static uint8_t v1_key2[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+// Vector 1, tweak: data unit sequence number 0 as a 16-byte value
+static uint8_t v1_TW[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+// Vector 1, plaintext: 32 zero bytes
+static uint8_t v1_PTX[32] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+// Vector 1, reference ciphertext (the CTX value listed in the comment above), 32 bytes
+static uint8_t v1_CTX[32] = {
+ 0x91, 0x7c, 0xf6, 0x9e, 0xbd, 0x68, 0xb2, 0xec,
+ 0x9b, 0x9f, 0xe9, 0xa3, 0xea, 0xdd, 0xa6, 0x92,
+ 0xcd, 0x43, 0xd2, 0xf5, 0x95, 0x98, 0xed, 0x85,
+ 0x8c, 0x02, 0xc2, 0x65, 0x2f, 0xbf, 0x92, 0x2e
+};
+
+/*
+ * Vector 2
+ * Key1 11111111111111111111111111111111
+ * Key2 22222222222222222222222222222222
+ * Data Unit Sequence number 3333333333
+ * PTX 4444444444444444444444444444444444444444444444444444444444444444
+ * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d
+ * CTX c454185e6a16936e39334038acef838bfb186fff7480adc4289382ecd6d394f0
+ * Plaintext length (bytes): 32
+ */
+
+// Vector 2, Key1: 16 bytes of 0x11
+static uint8_t v2_key1[16] = {
+ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11
+};
+
+// Vector 2, Key2: 16 bytes of 0x22
+static uint8_t v2_key2[16] = {
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22
+};
+
+// Vector 2, tweak: the data unit sequence number 3333333333 (hex) occupies
+// the first five bytes; the remaining eleven bytes are zero
+static uint8_t v2_TW[16] = {
+ 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+// Vector 2, plaintext: 32 bytes of 0x44
+static uint8_t v2_PTX[32] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44
+};
+
+// Vector 2, reference ciphertext (the CTX value listed in the comment above), 32 bytes
+static uint8_t v2_CTX[32] = {
+ 0xc4, 0x54, 0x18, 0x5e, 0x6a, 0x16, 0x93, 0x6e,
+ 0x39, 0x33, 0x40, 0x38, 0xac, 0xef, 0x83, 0x8b,
+ 0xfb, 0x18, 0x6f, 0xff, 0x74, 0x80, 0xad, 0xc4,
+ 0x28, 0x93, 0x82, 0xec, 0xd6, 0xd3, 0x94, 0xf0
+};
+
+/*
+ * Vector 3
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 22222222222222222222222222222222
+ * Data Unit Sequence number 3333333333
+ * PTX 4444444444444444444444444444444444444444444444444444444444444444
+ * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d
+ * CTX af85336b597afc1a900b2eb21ec949d292df4c047e0b21532186a5971a227a89
+ * Plaintext length (bytes): 32
+ */
+
+// Vector 3, Key1: bytes counting down from 0xff to 0xf0
+static uint8_t v3_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+// Vector 3, Key2: 16 bytes of 0x22 (same bytes as v2_key2)
+static uint8_t v3_key2[16] = {
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22
+};
+
+// Vector 3, tweak: the data unit sequence number 3333333333 (hex) occupies
+// the first five bytes; the remaining eleven bytes are zero
+static uint8_t v3_TW[16] = {
+ 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+// Vector 3, plaintext: 32 bytes of 0x44
+static uint8_t v3_PTX[32] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44
+};
+
+// Vector 3, reference ciphertext (the CTX value listed in the comment above), 32 bytes
+static uint8_t v3_CTX[32] = {
+ 0xaf, 0x85, 0x33, 0x6b, 0x59, 0x7a, 0xfc, 0x1a,
+ 0x90, 0x0b, 0x2e, 0xb2, 0x1e, 0xc9, 0x49, 0xd2,
+ 0x92, 0xdf, 0x4c, 0x04, 0x7e, 0x0b, 0x21, 0x53,
+ 0x21, 0x86, 0xa5, 0x97, 0x1a, 0x22, 0x7a, 0x89
+};
+
+/*
+ * Vector 4
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence number 0
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c
+ * CTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412
+ * CTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce
+ * CTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265
+ * CTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8
+ * CTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434
+ * CTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c
+ * CTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e
+ * CTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc
+ * CTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3
+ * CTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344
+ * CTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd
+ * CTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752
+ * CTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e
+ * CTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d
+ * CTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568
+ * Plaintext length (bytes): 512
+ */
+// Vector 4, Key1 (27182818284590452353602874713526 in the comment above)
+static uint8_t v4_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+// Vector 4, Key2 (31415926535897932384626433832795 in the comment above)
+static uint8_t v4_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+// Vector 4, tweak: data unit sequence number 0 as a 16-byte value
+static uint8_t v4_TW[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+// Vector 4, plaintext: 512 bytes, the byte sequence 0x00..0xff written twice
+static uint8_t v4_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v4_CTX[512] = {
+ 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76,
+ 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2,
+ 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25,
+ 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c,
+ 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f,
+ 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00,
+ 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad,
+ 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12,
+ 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5,
+ 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5,
+ 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc,
+ 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce,
+ 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4,
+ 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84,
+ 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a,
+ 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65,
+ 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89,
+ 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51,
+ 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15,
+ 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8,
+ 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed,
+ 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91,
+ 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e,
+ 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34,
+ 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b,
+ 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5,
+ 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4,
+ 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c,
+ 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd,
+ 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3,
+ 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f,
+ 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e,
+ 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91,
+ 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19,
+ 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1,
+ 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc,
+ 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed,
+ 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde,
+ 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98,
+ 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3,
+ 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca,
+ 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6,
+ 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc,
+ 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44,
+ 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0,
+ 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95,
+ 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4,
+ 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd,
+ 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13,
+ 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7,
+ 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a,
+ 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52,
+ 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a,
+ 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38,
+ 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e,
+ 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e,
+ 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad,
+ 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8,
+ 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c,
+ 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d,
+ 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f,
+ 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2,
+ 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea,
+ 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68
+};
+
+/*
+ * Vector 5
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number 01
+ * PTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c
+ * PTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412
+ * PTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce
+ * PTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265
+ * PTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8
+ * PTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434
+ * PTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c
+ * PTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e
+ * PTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc
+ * PTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3
+ * PTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344
+ * PTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd
+ * PTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752
+ * PTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e
+ * PTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d
+ * PTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568
+ * CTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5
+ * CTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb
+ * CTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f
+ * CTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501
+ * CTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99
+ * CTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407
+ * CTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3
+ * CTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb
+ * CTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd
+ * CTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114
+ * CTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed
+ * CTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62
+ * CTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284
+ * CTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32
+ * CTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c
+ * CTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd
+ * Plaintext length (bytes): 512
+ */
+
+static uint8_t v5_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v5_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v5_TW[16] = {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v5_PTX[512] = {
+ 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76,
+ 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2,
+ 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25,
+ 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c,
+ 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f,
+ 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00,
+ 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad,
+ 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12,
+ 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5,
+ 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5,
+ 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc,
+ 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce,
+ 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4,
+ 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84,
+ 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a,
+ 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65,
+ 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89,
+ 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51,
+ 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15,
+ 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8,
+ 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed,
+ 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91,
+ 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e,
+ 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34,
+ 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b,
+ 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5,
+ 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4,
+ 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c,
+ 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd,
+ 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3,
+ 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f,
+ 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e,
+ 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91,
+ 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19,
+ 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1,
+ 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc,
+ 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed,
+ 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde,
+ 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98,
+ 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3,
+ 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca,
+ 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6,
+ 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc,
+ 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44,
+ 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0,
+ 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95,
+ 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4,
+ 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd,
+ 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13,
+ 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7,
+ 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a,
+ 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52,
+ 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a,
+ 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38,
+ 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e,
+ 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e,
+ 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad,
+ 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8,
+ 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c,
+ 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d,
+ 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f,
+ 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2,
+ 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea,
+ 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68
+};
+
+static uint8_t v5_CTX[512] = {
+ 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe,
+ 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f,
+ 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60,
+ 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5,
+ 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d,
+ 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce,
+ 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b,
+ 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb,
+ 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49,
+ 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3,
+ 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28,
+ 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f,
+ 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca,
+ 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2,
+ 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa,
+ 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01,
+ 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43,
+ 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d,
+ 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58,
+ 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99,
+ 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3,
+ 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f,
+ 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e,
+ 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07,
+ 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a,
+ 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52,
+ 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c,
+ 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3,
+ 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16,
+ 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b,
+ 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6,
+ 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb,
+ 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5,
+ 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0,
+ 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44,
+ 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd,
+ 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb,
+ 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1,
+ 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf,
+ 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14,
+ 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7,
+ 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d,
+ 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c,
+ 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed,
+ 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7,
+ 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97,
+ 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32,
+ 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62,
+ 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75,
+ 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce,
+ 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04,
+ 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84,
+ 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43,
+ 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95,
+ 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4,
+ 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32,
+ 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab,
+ 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2,
+ 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d,
+ 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c,
+ 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4,
+ 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d,
+ 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a,
+ 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd
+};
+
+/*
+ * Vector 6
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number 02
+ * PTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5
+ * PTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb
+ * PTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f
+ * PTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501
+ * PTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99
+ * PTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407
+ * PTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3
+ * PTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb
+ * PTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd
+ * PTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114
+ * PTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed
+ * PTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62
+ * PTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284
+ * PTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32
+ * PTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c
+ * PTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd
+ * CTX fa762a3680b76007928ed4a4f49a9456031b704782e65e16cecb54ed7d017b5e
+ * CTX 18abd67b338e81078f21edb7868d901ebe9c731a7c18b5e6dec1d6a72e078ac9
+ * CTX a4262f860beefa14f4e821018272e411a951502b6e79066e84252c3346f3aa62
+ * CTX 344351a291d4bedc7a07618bdea2af63145cc7a4b8d4070691ae890cd65733e7
+ * CTX 946e9021a1dffc4c59f159425ee6d50ca9b135fa6162cea18a939838dc000fb3
+ * CTX 86fad086acce5ac07cb2ece7fd580b00cfa5e98589631dc25e8e2a3daf2ffdec
+ * CTX 26531659912c9d8f7a15e5865ea8fb5816d6207052bd7128cd743c12c8118791
+ * CTX a4736811935eb982a532349e31dd401e0b660a568cb1a4711f552f55ded59f1f
+ * CTX 15bf7196b3ca12a91e488ef59d64f3a02bf45239499ac6176ae321c4a211ec54
+ * CTX 5365971c5d3f4f09d4eb139bfdf2073d33180b21002b65cc9865e76cb24cd92c
+ * CTX 874c24c18350399a936ab3637079295d76c417776b94efce3a0ef7206b151105
+ * CTX 19655c956cbd8b2489405ee2b09a6b6eebe0c53790a12a8998378b33a5b71159
+ * CTX 625f4ba49d2a2fdba59fbf0897bc7aabd8d707dc140a80f0f309f835d3da54ab
+ * CTX 584e501dfa0ee977fec543f74186a802b9a37adb3e8291eca04d66520d229e60
+ * CTX 401e7282bef486ae059aa70696e0e305d777140a7a883ecdcb69b9ff938e8a42
+ * CTX 31864c69ca2c2043bed007ff3e605e014bcf518138dc3a25c5e236171a2d01d6
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v6_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v6_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v6_TW[16] = {
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v6_PTX[512] = {
+
+ 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe,
+ 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f,
+ 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60,
+ 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5,
+ 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d,
+ 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce,
+ 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b,
+ 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb,
+ 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49,
+ 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3,
+ 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28,
+ 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f,
+ 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca,
+ 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2,
+ 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa,
+ 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01,
+ 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43,
+ 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d,
+ 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58,
+ 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99,
+ 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3,
+ 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f,
+ 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e,
+ 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07,
+ 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a,
+ 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52,
+ 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c,
+ 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3,
+ 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16,
+ 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b,
+ 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6,
+ 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb,
+ 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5,
+ 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0,
+ 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44,
+ 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd,
+ 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb,
+ 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1,
+ 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf,
+ 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14,
+ 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7,
+ 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d,
+ 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c,
+ 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed,
+ 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7,
+ 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97,
+ 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32,
+ 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62,
+ 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75,
+ 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce,
+ 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04,
+ 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84,
+ 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43,
+ 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95,
+ 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4,
+ 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32,
+ 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab,
+ 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2,
+ 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d,
+ 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c,
+ 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4,
+ 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d,
+ 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a,
+ 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd
+};
+
+static uint8_t v6_CTX[512] = {
+
+ 0xfa, 0x76, 0x2a, 0x36, 0x80, 0xb7, 0x60, 0x07,
+ 0x92, 0x8e, 0xd4, 0xa4, 0xf4, 0x9a, 0x94, 0x56,
+ 0x03, 0x1b, 0x70, 0x47, 0x82, 0xe6, 0x5e, 0x16,
+ 0xce, 0xcb, 0x54, 0xed, 0x7d, 0x01, 0x7b, 0x5e,
+ 0x18, 0xab, 0xd6, 0x7b, 0x33, 0x8e, 0x81, 0x07,
+ 0x8f, 0x21, 0xed, 0xb7, 0x86, 0x8d, 0x90, 0x1e,
+ 0xbe, 0x9c, 0x73, 0x1a, 0x7c, 0x18, 0xb5, 0xe6,
+ 0xde, 0xc1, 0xd6, 0xa7, 0x2e, 0x07, 0x8a, 0xc9,
+ 0xa4, 0x26, 0x2f, 0x86, 0x0b, 0xee, 0xfa, 0x14,
+ 0xf4, 0xe8, 0x21, 0x01, 0x82, 0x72, 0xe4, 0x11,
+ 0xa9, 0x51, 0x50, 0x2b, 0x6e, 0x79, 0x06, 0x6e,
+ 0x84, 0x25, 0x2c, 0x33, 0x46, 0xf3, 0xaa, 0x62,
+ 0x34, 0x43, 0x51, 0xa2, 0x91, 0xd4, 0xbe, 0xdc,
+ 0x7a, 0x07, 0x61, 0x8b, 0xde, 0xa2, 0xaf, 0x63,
+ 0x14, 0x5c, 0xc7, 0xa4, 0xb8, 0xd4, 0x07, 0x06,
+ 0x91, 0xae, 0x89, 0x0c, 0xd6, 0x57, 0x33, 0xe7,
+ 0x94, 0x6e, 0x90, 0x21, 0xa1, 0xdf, 0xfc, 0x4c,
+ 0x59, 0xf1, 0x59, 0x42, 0x5e, 0xe6, 0xd5, 0x0c,
+ 0xa9, 0xb1, 0x35, 0xfa, 0x61, 0x62, 0xce, 0xa1,
+ 0x8a, 0x93, 0x98, 0x38, 0xdc, 0x00, 0x0f, 0xb3,
+ 0x86, 0xfa, 0xd0, 0x86, 0xac, 0xce, 0x5a, 0xc0,
+ 0x7c, 0xb2, 0xec, 0xe7, 0xfd, 0x58, 0x0b, 0x00,
+ 0xcf, 0xa5, 0xe9, 0x85, 0x89, 0x63, 0x1d, 0xc2,
+ 0x5e, 0x8e, 0x2a, 0x3d, 0xaf, 0x2f, 0xfd, 0xec,
+ 0x26, 0x53, 0x16, 0x59, 0x91, 0x2c, 0x9d, 0x8f,
+ 0x7a, 0x15, 0xe5, 0x86, 0x5e, 0xa8, 0xfb, 0x58,
+ 0x16, 0xd6, 0x20, 0x70, 0x52, 0xbd, 0x71, 0x28,
+ 0xcd, 0x74, 0x3c, 0x12, 0xc8, 0x11, 0x87, 0x91,
+ 0xa4, 0x73, 0x68, 0x11, 0x93, 0x5e, 0xb9, 0x82,
+ 0xa5, 0x32, 0x34, 0x9e, 0x31, 0xdd, 0x40, 0x1e,
+ 0x0b, 0x66, 0x0a, 0x56, 0x8c, 0xb1, 0xa4, 0x71,
+ 0x1f, 0x55, 0x2f, 0x55, 0xde, 0xd5, 0x9f, 0x1f,
+ 0x15, 0xbf, 0x71, 0x96, 0xb3, 0xca, 0x12, 0xa9,
+ 0x1e, 0x48, 0x8e, 0xf5, 0x9d, 0x64, 0xf3, 0xa0,
+ 0x2b, 0xf4, 0x52, 0x39, 0x49, 0x9a, 0xc6, 0x17,
+ 0x6a, 0xe3, 0x21, 0xc4, 0xa2, 0x11, 0xec, 0x54,
+ 0x53, 0x65, 0x97, 0x1c, 0x5d, 0x3f, 0x4f, 0x09,
+ 0xd4, 0xeb, 0x13, 0x9b, 0xfd, 0xf2, 0x07, 0x3d,
+ 0x33, 0x18, 0x0b, 0x21, 0x00, 0x2b, 0x65, 0xcc,
+ 0x98, 0x65, 0xe7, 0x6c, 0xb2, 0x4c, 0xd9, 0x2c,
+ 0x87, 0x4c, 0x24, 0xc1, 0x83, 0x50, 0x39, 0x9a,
+ 0x93, 0x6a, 0xb3, 0x63, 0x70, 0x79, 0x29, 0x5d,
+ 0x76, 0xc4, 0x17, 0x77, 0x6b, 0x94, 0xef, 0xce,
+ 0x3a, 0x0e, 0xf7, 0x20, 0x6b, 0x15, 0x11, 0x05,
+ 0x19, 0x65, 0x5c, 0x95, 0x6c, 0xbd, 0x8b, 0x24,
+ 0x89, 0x40, 0x5e, 0xe2, 0xb0, 0x9a, 0x6b, 0x6e,
+ 0xeb, 0xe0, 0xc5, 0x37, 0x90, 0xa1, 0x2a, 0x89,
+ 0x98, 0x37, 0x8b, 0x33, 0xa5, 0xb7, 0x11, 0x59,
+ 0x62, 0x5f, 0x4b, 0xa4, 0x9d, 0x2a, 0x2f, 0xdb,
+ 0xa5, 0x9f, 0xbf, 0x08, 0x97, 0xbc, 0x7a, 0xab,
+ 0xd8, 0xd7, 0x07, 0xdc, 0x14, 0x0a, 0x80, 0xf0,
+ 0xf3, 0x09, 0xf8, 0x35, 0xd3, 0xda, 0x54, 0xab,
+ 0x58, 0x4e, 0x50, 0x1d, 0xfa, 0x0e, 0xe9, 0x77,
+ 0xfe, 0xc5, 0x43, 0xf7, 0x41, 0x86, 0xa8, 0x02,
+ 0xb9, 0xa3, 0x7a, 0xdb, 0x3e, 0x82, 0x91, 0xec,
+ 0xa0, 0x4d, 0x66, 0x52, 0x0d, 0x22, 0x9e, 0x60,
+ 0x40, 0x1e, 0x72, 0x82, 0xbe, 0xf4, 0x86, 0xae,
+ 0x05, 0x9a, 0xa7, 0x06, 0x96, 0xe0, 0xe3, 0x05,
+ 0xd7, 0x77, 0x14, 0x0a, 0x7a, 0x88, 0x3e, 0xcd,
+ 0xcb, 0x69, 0xb9, 0xff, 0x93, 0x8e, 0x8a, 0x42,
+ 0x31, 0x86, 0x4c, 0x69, 0xca, 0x2c, 0x20, 0x43,
+ 0xbe, 0xd0, 0x07, 0xff, 0x3e, 0x60, 0x5e, 0x01,
+ 0x4b, 0xcf, 0x51, 0x81, 0x38, 0xdc, 0x3a, 0x25,
+ 0xc5, 0xe2, 0x36, 0x17, 0x1a, 0x2d, 0x01, 0xd6
+};
+
+/*
+ * Vector 7
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number fd
+ * PTX 8e41b78c390b5af9d758bb214a67e9f6bf7727b09ac6124084c37611398fa45d
+ * PTX aad94868600ed391fb1acd4857a95b466e62ef9f4b377244d1c152e7b30d731a
+ * PTX ad30c716d214b707aed99eb5b5e580b3e887cf7497465651d4b60e6042051da3
+ * PTX 693c3b78c14489543be8b6ad0ba629565bba202313ba7b0d0c94a3252b676f46
+ * PTX cc02ce0f8a7d34c0ed229129673c1f61aed579d08a9203a25aac3a77e9db6026
+ * PTX 7996db38df637356d9dcd1632e369939f2a29d89345c66e05066f1a3677aef18
+ * PTX dea4113faeb629e46721a66d0a7e785d3e29af2594eb67dfa982affe0aac058f
+ * PTX 6e15864269b135418261fc3afb089472cf68c45dd7f231c6249ba0255e1e0338
+ * PTX 33fc4d00a3fe02132d7bc3873614b8aee34273581ea0325c81f0270affa13641
+ * PTX d052d36f0757d484014354d02d6883ca15c24d8c3956b1bd027bcf41f151fd80
+ * PTX 23c5340e5606f37e90fdb87c86fb4fa634b3718a30bace06a66eaf8f63c4aa3b
+ * PTX 637826a87fe8cfa44282e92cb1615af3a28e53bc74c7cba1a0977be9065d0c1a
+ * PTX 5dec6c54ae38d37f37aa35283e048e5530a85c4e7a29d7b92ec0c3169cdf2a80
+ * PTX 5c7604bce60049b9fb7b8eaac10f51ae23794ceba68bb58112e293b9b692ca72
+ * PTX 1b37c662f8574ed4dba6f88e170881c82cddc1034a0ca7e284bf0962b6b26292
+ * PTX d836fa9f73c1ac770eef0f2d3a1eaf61d3e03555fd424eedd67e18a18094f888
+ * CTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3
+ * CTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22
+ * CTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce
+ * CTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c
+ * CTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8
+ * CTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b
+ * CTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82
+ * CTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969
+ * CTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5
+ * CTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b
+ * CTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec
+ * CTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d
+ * CTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef
+ * CTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280
+ * CTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3
+ * CTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v7_key1[16] = {
+
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v7_key2[16] = {
+
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v7_TW[16] = {
+
+ 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v7_PTX[512] = {
+
+ 0x8e, 0x41, 0xb7, 0x8c, 0x39, 0x0b, 0x5a, 0xf9,
+ 0xd7, 0x58, 0xbb, 0x21, 0x4a, 0x67, 0xe9, 0xf6,
+ 0xbf, 0x77, 0x27, 0xb0, 0x9a, 0xc6, 0x12, 0x40,
+ 0x84, 0xc3, 0x76, 0x11, 0x39, 0x8f, 0xa4, 0x5d,
+ 0xaa, 0xd9, 0x48, 0x68, 0x60, 0x0e, 0xd3, 0x91,
+ 0xfb, 0x1a, 0xcd, 0x48, 0x57, 0xa9, 0x5b, 0x46,
+ 0x6e, 0x62, 0xef, 0x9f, 0x4b, 0x37, 0x72, 0x44,
+ 0xd1, 0xc1, 0x52, 0xe7, 0xb3, 0x0d, 0x73, 0x1a,
+ 0xad, 0x30, 0xc7, 0x16, 0xd2, 0x14, 0xb7, 0x07,
+ 0xae, 0xd9, 0x9e, 0xb5, 0xb5, 0xe5, 0x80, 0xb3,
+ 0xe8, 0x87, 0xcf, 0x74, 0x97, 0x46, 0x56, 0x51,
+ 0xd4, 0xb6, 0x0e, 0x60, 0x42, 0x05, 0x1d, 0xa3,
+ 0x69, 0x3c, 0x3b, 0x78, 0xc1, 0x44, 0x89, 0x54,
+ 0x3b, 0xe8, 0xb6, 0xad, 0x0b, 0xa6, 0x29, 0x56,
+ 0x5b, 0xba, 0x20, 0x23, 0x13, 0xba, 0x7b, 0x0d,
+ 0x0c, 0x94, 0xa3, 0x25, 0x2b, 0x67, 0x6f, 0x46,
+ 0xcc, 0x02, 0xce, 0x0f, 0x8a, 0x7d, 0x34, 0xc0,
+ 0xed, 0x22, 0x91, 0x29, 0x67, 0x3c, 0x1f, 0x61,
+ 0xae, 0xd5, 0x79, 0xd0, 0x8a, 0x92, 0x03, 0xa2,
+ 0x5a, 0xac, 0x3a, 0x77, 0xe9, 0xdb, 0x60, 0x26,
+ 0x79, 0x96, 0xdb, 0x38, 0xdf, 0x63, 0x73, 0x56,
+ 0xd9, 0xdc, 0xd1, 0x63, 0x2e, 0x36, 0x99, 0x39,
+ 0xf2, 0xa2, 0x9d, 0x89, 0x34, 0x5c, 0x66, 0xe0,
+ 0x50, 0x66, 0xf1, 0xa3, 0x67, 0x7a, 0xef, 0x18,
+ 0xde, 0xa4, 0x11, 0x3f, 0xae, 0xb6, 0x29, 0xe4,
+ 0x67, 0x21, 0xa6, 0x6d, 0x0a, 0x7e, 0x78, 0x5d,
+ 0x3e, 0x29, 0xaf, 0x25, 0x94, 0xeb, 0x67, 0xdf,
+ 0xa9, 0x82, 0xaf, 0xfe, 0x0a, 0xac, 0x05, 0x8f,
+ 0x6e, 0x15, 0x86, 0x42, 0x69, 0xb1, 0x35, 0x41,
+ 0x82, 0x61, 0xfc, 0x3a, 0xfb, 0x08, 0x94, 0x72,
+ 0xcf, 0x68, 0xc4, 0x5d, 0xd7, 0xf2, 0x31, 0xc6,
+ 0x24, 0x9b, 0xa0, 0x25, 0x5e, 0x1e, 0x03, 0x38,
+ 0x33, 0xfc, 0x4d, 0x00, 0xa3, 0xfe, 0x02, 0x13,
+ 0x2d, 0x7b, 0xc3, 0x87, 0x36, 0x14, 0xb8, 0xae,
+ 0xe3, 0x42, 0x73, 0x58, 0x1e, 0xa0, 0x32, 0x5c,
+ 0x81, 0xf0, 0x27, 0x0a, 0xff, 0xa1, 0x36, 0x41,
+ 0xd0, 0x52, 0xd3, 0x6f, 0x07, 0x57, 0xd4, 0x84,
+ 0x01, 0x43, 0x54, 0xd0, 0x2d, 0x68, 0x83, 0xca,
+ 0x15, 0xc2, 0x4d, 0x8c, 0x39, 0x56, 0xb1, 0xbd,
+ 0x02, 0x7b, 0xcf, 0x41, 0xf1, 0x51, 0xfd, 0x80,
+ 0x23, 0xc5, 0x34, 0x0e, 0x56, 0x06, 0xf3, 0x7e,
+ 0x90, 0xfd, 0xb8, 0x7c, 0x86, 0xfb, 0x4f, 0xa6,
+ 0x34, 0xb3, 0x71, 0x8a, 0x30, 0xba, 0xce, 0x06,
+ 0xa6, 0x6e, 0xaf, 0x8f, 0x63, 0xc4, 0xaa, 0x3b,
+ 0x63, 0x78, 0x26, 0xa8, 0x7f, 0xe8, 0xcf, 0xa4,
+ 0x42, 0x82, 0xe9, 0x2c, 0xb1, 0x61, 0x5a, 0xf3,
+ 0xa2, 0x8e, 0x53, 0xbc, 0x74, 0xc7, 0xcb, 0xa1,
+ 0xa0, 0x97, 0x7b, 0xe9, 0x06, 0x5d, 0x0c, 0x1a,
+ 0x5d, 0xec, 0x6c, 0x54, 0xae, 0x38, 0xd3, 0x7f,
+ 0x37, 0xaa, 0x35, 0x28, 0x3e, 0x04, 0x8e, 0x55,
+ 0x30, 0xa8, 0x5c, 0x4e, 0x7a, 0x29, 0xd7, 0xb9,
+ 0x2e, 0xc0, 0xc3, 0x16, 0x9c, 0xdf, 0x2a, 0x80,
+ 0x5c, 0x76, 0x04, 0xbc, 0xe6, 0x00, 0x49, 0xb9,
+ 0xfb, 0x7b, 0x8e, 0xaa, 0xc1, 0x0f, 0x51, 0xae,
+ 0x23, 0x79, 0x4c, 0xeb, 0xa6, 0x8b, 0xb5, 0x81,
+ 0x12, 0xe2, 0x93, 0xb9, 0xb6, 0x92, 0xca, 0x72,
+ 0x1b, 0x37, 0xc6, 0x62, 0xf8, 0x57, 0x4e, 0xd4,
+ 0xdb, 0xa6, 0xf8, 0x8e, 0x17, 0x08, 0x81, 0xc8,
+ 0x2c, 0xdd, 0xc1, 0x03, 0x4a, 0x0c, 0xa7, 0xe2,
+ 0x84, 0xbf, 0x09, 0x62, 0xb6, 0xb2, 0x62, 0x92,
+ 0xd8, 0x36, 0xfa, 0x9f, 0x73, 0xc1, 0xac, 0x77,
+ 0x0e, 0xef, 0x0f, 0x2d, 0x3a, 0x1e, 0xaf, 0x61,
+ 0xd3, 0xe0, 0x35, 0x55, 0xfd, 0x42, 0x4e, 0xed,
+ 0xd6, 0x7e, 0x18, 0xa1, 0x80, 0x94, 0xf8, 0x88
+};
+
+static uint8_t v7_CTX[512] = {
+
+ 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e,
+ 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a,
+ 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9,
+ 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3,
+ 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07,
+ 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3,
+ 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08,
+ 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22,
+ 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c,
+ 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f,
+ 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd,
+ 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce,
+ 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f,
+ 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5,
+ 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc,
+ 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c,
+ 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3,
+ 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b,
+ 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec,
+ 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8,
+ 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee,
+ 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b,
+ 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3,
+ 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b,
+ 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd,
+ 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c,
+ 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2,
+ 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82,
+ 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d,
+ 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd,
+ 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6,
+ 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69,
+ 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e,
+ 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7,
+ 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53,
+ 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5,
+ 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58,
+ 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32,
+ 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a,
+ 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b,
+ 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17,
+ 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45,
+ 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54,
+ 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec,
+ 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0,
+ 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39,
+ 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66,
+ 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d,
+ 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83,
+ 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31,
+ 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03,
+ 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef,
+ 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18,
+ 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1,
+ 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65,
+ 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80,
+ 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23,
+ 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f,
+ 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49,
+ 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3,
+ 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78,
+ 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1,
+ 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc,
+ 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37
+};
+
+/*
+ * Vector 8
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number fe
+ * PTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3
+ * PTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22
+ * PTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce
+ * PTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c
+ * PTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8
+ * PTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b
+ * PTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82
+ * PTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969
+ * PTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5
+ * PTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b
+ * PTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec
+ * PTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d
+ * PTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef
+ * PTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280
+ * PTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3
+ * PTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637
+ * CTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395
+ * CTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae
+ * CTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94
+ * CTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0
+ * CTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd
+ * CTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d
+ * CTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792
+ * CTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e
+ * CTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18
+ * CTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b
+ * CTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb
+ * CTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a
+ * CTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d
+ * CTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4
+ * CTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1
+ * CTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v8_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v8_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v8_TW[16] = {
+ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v8_PTX[512] = {
+ 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e,
+ 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a,
+ 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9,
+ 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3,
+ 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07,
+ 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3,
+ 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08,
+ 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22,
+ 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c,
+ 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f,
+ 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd,
+ 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce,
+ 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f,
+ 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5,
+ 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc,
+ 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c,
+ 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3,
+ 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b,
+ 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec,
+ 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8,
+ 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee,
+ 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b,
+ 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3,
+ 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b,
+ 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd,
+ 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c,
+ 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2,
+ 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82,
+ 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d,
+ 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd,
+ 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6,
+ 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69,
+ 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e,
+ 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7,
+ 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53,
+ 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5,
+ 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58,
+ 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32,
+ 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a,
+ 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b,
+ 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17,
+ 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45,
+ 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54,
+ 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec,
+ 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0,
+ 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39,
+ 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66,
+ 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d,
+ 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83,
+ 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31,
+ 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03,
+ 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef,
+ 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18,
+ 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1,
+ 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65,
+ 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80,
+ 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23,
+ 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f,
+ 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49,
+ 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3,
+ 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78,
+ 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1,
+ 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc,
+ 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37
+};
+
+static uint8_t v8_CTX[512] = {
+ 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59,
+ 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda,
+ 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44,
+ 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95,
+ 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1,
+ 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2,
+ 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e,
+ 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae,
+ 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e,
+ 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a,
+ 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27,
+ 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94,
+ 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c,
+ 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32,
+ 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18,
+ 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0,
+ 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d,
+ 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d,
+ 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf,
+ 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd,
+ 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1,
+ 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72,
+ 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb,
+ 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d,
+ 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6,
+ 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b,
+ 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58,
+ 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92,
+ 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf,
+ 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a,
+ 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29,
+ 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e,
+ 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28,
+ 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77,
+ 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87,
+ 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18,
+ 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1,
+ 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd,
+ 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67,
+ 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b,
+ 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde,
+ 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75,
+ 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a,
+ 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb,
+ 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32,
+ 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11,
+ 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65,
+ 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a,
+ 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca,
+ 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24,
+ 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e,
+ 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d,
+ 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05,
+ 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06,
+ 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9,
+ 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4,
+ 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11,
+ 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c,
+ 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85,
+ 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1,
+ 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11,
+ 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c,
+ 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef,
+ 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a
+};
+
+/*
+ * Vector 9
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number ff
+ * PTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395
+ * PTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae
+ * PTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94
+ * PTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0
+ * PTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd
+ * PTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d
+ * PTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792
+ * PTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e
+ * PTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18
+ * PTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b
+ * PTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb
+ * PTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a
+ * PTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d
+ * PTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4
+ * PTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1
+ * PTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a
+ * CTX 3260ae8dad1f4a32c5cafe3ab0eb95549d461a67ceb9e5aa2d3afb62dece0553
+ * CTX 193ba50c75be251e08d1d08f1088576c7efdfaaf3f459559571e12511753b07a
+ * CTX f073f35da06af0ce0bbf6b8f5ccc5cea500ec1b211bd51f63b606bf6528796ca
+ * CTX 12173ba39b8935ee44ccce646f90a45bf9ccc567f0ace13dc2d53ebeedc81f58
+ * CTX b2e41179dddf0d5a5c42f5d8506c1a5d2f8f59f3ea873cbcd0eec19acbf32542
+ * CTX 3bd3dcb8c2b1bf1d1eaed0eba7f0698e4314fbeb2f1566d1b9253008cbccf45a
+ * CTX 2b0d9c5c9c21474f4076e02be26050b99dee4fd68a4cf890e496e4fcae7b70f9
+ * CTX 4ea5a9062da0daeba1993d2ccd1dd3c244b8428801495a58b216547e7e847c46
+ * CTX d1d756377b6242d2e5fb83bf752b54e0df71e889f3a2bb0f4c10805bf3c59037
+ * CTX 6e3c24e22ff57f7fa965577375325cea5d920db94b9c336b455f6e894c01866f
+ * CTX e9fbb8c8d3f70a2957285f6dfb5dcd8cbf54782f8fe7766d4723819913ac7734
+ * CTX 21e3a31095866bad22c86a6036b2518b2059b4229d18c8c2ccbdf906c6cc6e82
+ * CTX 464ee57bddb0bebcb1dc645325bfb3e665ef7251082c88ebb1cf203bd779fdd3
+ * CTX 8675713c8daadd17e1cabee432b09787b6ddf3304e38b731b45df5df51b78fcf
+ * CTX b3d32466028d0ba36555e7e11ab0ee0666061d1645d962444bc47a38188930a8
+ * CTX 4b4d561395c73c087021927ca638b7afc8a8679ccb84c26555440ec7f10445cd
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v9_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v9_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v9_TW[16] = {
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v9_PTX[512] = {
+ 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59,
+ 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda,
+ 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44,
+ 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95,
+ 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1,
+ 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2,
+ 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e,
+ 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae,
+ 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e,
+ 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a,
+ 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27,
+ 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94,
+ 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c,
+ 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32,
+ 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18,
+ 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0,
+ 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d,
+ 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d,
+ 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf,
+ 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd,
+ 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1,
+ 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72,
+ 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb,
+ 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d,
+ 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6,
+ 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b,
+ 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58,
+ 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92,
+ 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf,
+ 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a,
+ 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29,
+ 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e,
+ 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28,
+ 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77,
+ 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87,
+ 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18,
+ 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1,
+ 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd,
+ 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67,
+ 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b,
+ 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde,
+ 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75,
+ 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a,
+ 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb,
+ 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32,
+ 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11,
+ 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65,
+ 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a,
+ 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca,
+ 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24,
+ 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e,
+ 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d,
+ 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05,
+ 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06,
+ 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9,
+ 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4,
+ 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11,
+ 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c,
+ 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85,
+ 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1,
+ 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11,
+ 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c,
+ 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef,
+ 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a
+};
+
+static uint8_t v9_CTX[512] = {
+ 0x32, 0x60, 0xae, 0x8d, 0xad, 0x1f, 0x4a, 0x32,
+ 0xc5, 0xca, 0xfe, 0x3a, 0xb0, 0xeb, 0x95, 0x54,
+ 0x9d, 0x46, 0x1a, 0x67, 0xce, 0xb9, 0xe5, 0xaa,
+ 0x2d, 0x3a, 0xfb, 0x62, 0xde, 0xce, 0x05, 0x53,
+ 0x19, 0x3b, 0xa5, 0x0c, 0x75, 0xbe, 0x25, 0x1e,
+ 0x08, 0xd1, 0xd0, 0x8f, 0x10, 0x88, 0x57, 0x6c,
+ 0x7e, 0xfd, 0xfa, 0xaf, 0x3f, 0x45, 0x95, 0x59,
+ 0x57, 0x1e, 0x12, 0x51, 0x17, 0x53, 0xb0, 0x7a,
+ 0xf0, 0x73, 0xf3, 0x5d, 0xa0, 0x6a, 0xf0, 0xce,
+ 0x0b, 0xbf, 0x6b, 0x8f, 0x5c, 0xcc, 0x5c, 0xea,
+ 0x50, 0x0e, 0xc1, 0xb2, 0x11, 0xbd, 0x51, 0xf6,
+ 0x3b, 0x60, 0x6b, 0xf6, 0x52, 0x87, 0x96, 0xca,
+ 0x12, 0x17, 0x3b, 0xa3, 0x9b, 0x89, 0x35, 0xee,
+ 0x44, 0xcc, 0xce, 0x64, 0x6f, 0x90, 0xa4, 0x5b,
+ 0xf9, 0xcc, 0xc5, 0x67, 0xf0, 0xac, 0xe1, 0x3d,
+ 0xc2, 0xd5, 0x3e, 0xbe, 0xed, 0xc8, 0x1f, 0x58,
+ 0xb2, 0xe4, 0x11, 0x79, 0xdd, 0xdf, 0x0d, 0x5a,
+ 0x5c, 0x42, 0xf5, 0xd8, 0x50, 0x6c, 0x1a, 0x5d,
+ 0x2f, 0x8f, 0x59, 0xf3, 0xea, 0x87, 0x3c, 0xbc,
+ 0xd0, 0xee, 0xc1, 0x9a, 0xcb, 0xf3, 0x25, 0x42,
+ 0x3b, 0xd3, 0xdc, 0xb8, 0xc2, 0xb1, 0xbf, 0x1d,
+ 0x1e, 0xae, 0xd0, 0xeb, 0xa7, 0xf0, 0x69, 0x8e,
+ 0x43, 0x14, 0xfb, 0xeb, 0x2f, 0x15, 0x66, 0xd1,
+ 0xb9, 0x25, 0x30, 0x08, 0xcb, 0xcc, 0xf4, 0x5a,
+ 0x2b, 0x0d, 0x9c, 0x5c, 0x9c, 0x21, 0x47, 0x4f,
+ 0x40, 0x76, 0xe0, 0x2b, 0xe2, 0x60, 0x50, 0xb9,
+ 0x9d, 0xee, 0x4f, 0xd6, 0x8a, 0x4c, 0xf8, 0x90,
+ 0xe4, 0x96, 0xe4, 0xfc, 0xae, 0x7b, 0x70, 0xf9,
+ 0x4e, 0xa5, 0xa9, 0x06, 0x2d, 0xa0, 0xda, 0xeb,
+ 0xa1, 0x99, 0x3d, 0x2c, 0xcd, 0x1d, 0xd3, 0xc2,
+ 0x44, 0xb8, 0x42, 0x88, 0x01, 0x49, 0x5a, 0x58,
+ 0xb2, 0x16, 0x54, 0x7e, 0x7e, 0x84, 0x7c, 0x46,
+ 0xd1, 0xd7, 0x56, 0x37, 0x7b, 0x62, 0x42, 0xd2,
+ 0xe5, 0xfb, 0x83, 0xbf, 0x75, 0x2b, 0x54, 0xe0,
+ 0xdf, 0x71, 0xe8, 0x89, 0xf3, 0xa2, 0xbb, 0x0f,
+ 0x4c, 0x10, 0x80, 0x5b, 0xf3, 0xc5, 0x90, 0x37,
+ 0x6e, 0x3c, 0x24, 0xe2, 0x2f, 0xf5, 0x7f, 0x7f,
+ 0xa9, 0x65, 0x57, 0x73, 0x75, 0x32, 0x5c, 0xea,
+ 0x5d, 0x92, 0x0d, 0xb9, 0x4b, 0x9c, 0x33, 0x6b,
+ 0x45, 0x5f, 0x6e, 0x89, 0x4c, 0x01, 0x86, 0x6f,
+ 0xe9, 0xfb, 0xb8, 0xc8, 0xd3, 0xf7, 0x0a, 0x29,
+ 0x57, 0x28, 0x5f, 0x6d, 0xfb, 0x5d, 0xcd, 0x8c,
+ 0xbf, 0x54, 0x78, 0x2f, 0x8f, 0xe7, 0x76, 0x6d,
+ 0x47, 0x23, 0x81, 0x99, 0x13, 0xac, 0x77, 0x34,
+ 0x21, 0xe3, 0xa3, 0x10, 0x95, 0x86, 0x6b, 0xad,
+ 0x22, 0xc8, 0x6a, 0x60, 0x36, 0xb2, 0x51, 0x8b,
+ 0x20, 0x59, 0xb4, 0x22, 0x9d, 0x18, 0xc8, 0xc2,
+ 0xcc, 0xbd, 0xf9, 0x06, 0xc6, 0xcc, 0x6e, 0x82,
+ 0x46, 0x4e, 0xe5, 0x7b, 0xdd, 0xb0, 0xbe, 0xbc,
+ 0xb1, 0xdc, 0x64, 0x53, 0x25, 0xbf, 0xb3, 0xe6,
+ 0x65, 0xef, 0x72, 0x51, 0x08, 0x2c, 0x88, 0xeb,
+ 0xb1, 0xcf, 0x20, 0x3b, 0xd7, 0x79, 0xfd, 0xd3,
+ 0x86, 0x75, 0x71, 0x3c, 0x8d, 0xaa, 0xdd, 0x17,
+ 0xe1, 0xca, 0xbe, 0xe4, 0x32, 0xb0, 0x97, 0x87,
+ 0xb6, 0xdd, 0xf3, 0x30, 0x4e, 0x38, 0xb7, 0x31,
+ 0xb4, 0x5d, 0xf5, 0xdf, 0x51, 0xb7, 0x8f, 0xcf,
+ 0xb3, 0xd3, 0x24, 0x66, 0x02, 0x8d, 0x0b, 0xa3,
+ 0x65, 0x55, 0xe7, 0xe1, 0x1a, 0xb0, 0xee, 0x06,
+ 0x66, 0x06, 0x1d, 0x16, 0x45, 0xd9, 0x62, 0x44,
+ 0x4b, 0xc4, 0x7a, 0x38, 0x18, 0x89, 0x30, 0xa8,
+ 0x4b, 0x4d, 0x56, 0x13, 0x95, 0xc7, 0x3c, 0x08,
+ 0x70, 0x21, 0x92, 0x7c, 0xa6, 0x38, 0xb7, 0xaf,
+ 0xc8, 0xa8, 0x67, 0x9c, 0xcb, 0x84, 0xc2, 0x65,
+ 0x55, 0x44, 0x0e, 0xc7, 0xf1, 0x04, 0x45, 0xcd
+};
+
+/*
+ * Vector 15
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f10
+ * CTX 6c1625db4671522d3d7599601de7ca09ed
+ * Plaintext length (bytes): 17
+ */
+
+static uint8_t v15_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v15_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v15_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v15_PTX[17] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10
+};
+
+static uint8_t v15_CTX[17] = {
+ 0x6c, 0x16, 0x25, 0xdb, 0x46, 0x71, 0x52, 0x2d,
+ 0x3d, 0x75, 0x99, 0x60, 0x1d, 0xe7, 0xca, 0x09,
+ 0xed
+};
+
+/*
+ * Vector 16
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f1011
+ * CTX d069444b7a7e0cab09e24447d24deb1fedbf
+ * Plaintext length (bytes): 18
+ */
+static uint8_t v16_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v16_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v16_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v16_PTX[18] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11
+};
+
+static uint8_t v16_CTX[18] = {
+ 0xd0, 0x69, 0x44, 0x4b, 0x7a, 0x7e, 0x0c, 0xab,
+ 0x09, 0xe2, 0x44, 0x47, 0xd2, 0x4d, 0xeb, 0x1f,
+ 0xed, 0xbf
+};
+
+/*
+ * Vector 17
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f101112
+ * CTX e5df1351c0544ba1350b3363cd8ef4beedbf9d
+ * Plaintext length (bytes): 19
+ */
+
+static uint8_t v17_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v17_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v17_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v17_PTX[19] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12
+};
+
+static uint8_t v17_CTX[19] = {
+ 0xe5, 0xdf, 0x13, 0x51, 0xc0, 0x54, 0x4b, 0xa1,
+ 0x35, 0x0b, 0x33, 0x63, 0xcd, 0x8e, 0xf4, 0xbe,
+ 0xed, 0xbf, 0x9d
+};
+
+/*
+ * Vector 18
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f10111213
+ * CTX 9d84c813f719aa2c7be3f66171c7c5c2edbf9dac
+ * Plaintext length (bytes): 20
+ */
+
+static uint8_t v18_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v18_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v18_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v18_PTX[20] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13
+};
+
+static uint8_t v18_CTX[20] = {
+ 0x9d, 0x84, 0xc8, 0x13, 0xf7, 0x19, 0xaa, 0x2c,
+ 0x7b, 0xe3, 0xf6, 0x61, 0x71, 0xc7, 0xc5, 0xc2,
+ 0xed, 0xbf, 0x9d, 0xac
+};
+
+/*
+ * Vector 19
+ * Key1 e0e1e2e3e4e5e6e7e8e9eaebecedeeef
+ * Key2 c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+ * Data unit sequence number 21436587a9
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 38b45812ef43a05bd957e545907e223b954ab4aaf088303ad910eadf14b42be6
+ * CTX 8b2461149d8c8ba85f992be970bc621f1b06573f63e867bf5875acafa04e42cc
+ * CTX bd7bd3c2a0fb1fff791ec5ec36c66ae4ac1e806d81fbf709dbe29e471fad3854
+ * CTX 9c8e66f5345d7c1eb94f405d1ec785cc6f6a68f6254dd8339f9d84057e01a177
+ * CTX 41990482999516b5611a38f41bb6478e6f173f320805dd71b1932fc333cb9ee3
+ * CTX 9936beea9ad96fa10fb4112b901734ddad40bc1878995f8e11aee7d141a2f5d4
+ * CTX 8b7a4e1e7f0b2c04830e69a4fd1378411c2f287edf48c6c4e5c247a19680f7fe
+ * CTX 41cefbd49b582106e3616cbbe4dfb2344b2ae9519391f3e0fb4922254b1d6d2d
+ * CTX 19c6d4d537b3a26f3bcc51588b32f3eca0829b6a5ac72578fb814fb43cf80d64
+ * CTX a233e3f997a3f02683342f2b33d25b492536b93becb2f5e1a8b82f5b88334272
+ * CTX 9e8ae09d16938841a21a97fb543eea3bbff59f13c1a18449e398701c1ad51648
+ * CTX 346cbc04c27bb2da3b93a1372ccae548fb53bee476f9e9c91773b1bb19828394
+ * CTX d55d3e1a20ed69113a860b6829ffa847224604435070221b257e8dff783615d2
+ * CTX cae4803a93aa4334ab482a0afac9c0aeda70b45a481df5dec5df8cc0f423c77a
+ * CTX 5fd46cd312021d4b438862419a791be03bb4d97c0e59578542531ba466a83baf
+ * CTX 92cefc151b5cc1611a167893819b63fb8a6b18e86de60290fa72b797b0ce59f3
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v19_key1[16] = {
+
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef
+};
+
+static uint8_t v19_key2[16] = {
+
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf
+};
+
+static uint8_t v19_TW[16] = {
+
+ 0x21, 0x43, 0x65, 0x87, 0xa9, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v19_PTX[512] = {
+
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v19_CTX[512] = {
+ 0x38, 0xb4, 0x58, 0x12, 0xef, 0x43, 0xa0, 0x5b,
+ 0xd9, 0x57, 0xe5, 0x45, 0x90, 0x7e, 0x22, 0x3b,
+ 0x95, 0x4a, 0xb4, 0xaa, 0xf0, 0x88, 0x30, 0x3a,
+ 0xd9, 0x10, 0xea, 0xdf, 0x14, 0xb4, 0x2b, 0xe6,
+ 0x8b, 0x24, 0x61, 0x14, 0x9d, 0x8c, 0x8b, 0xa8,
+ 0x5f, 0x99, 0x2b, 0xe9, 0x70, 0xbc, 0x62, 0x1f,
+ 0x1b, 0x06, 0x57, 0x3f, 0x63, 0xe8, 0x67, 0xbf,
+ 0x58, 0x75, 0xac, 0xaf, 0xa0, 0x4e, 0x42, 0xcc,
+ 0xbd, 0x7b, 0xd3, 0xc2, 0xa0, 0xfb, 0x1f, 0xff,
+ 0x79, 0x1e, 0xc5, 0xec, 0x36, 0xc6, 0x6a, 0xe4,
+ 0xac, 0x1e, 0x80, 0x6d, 0x81, 0xfb, 0xf7, 0x09,
+ 0xdb, 0xe2, 0x9e, 0x47, 0x1f, 0xad, 0x38, 0x54,
+ 0x9c, 0x8e, 0x66, 0xf5, 0x34, 0x5d, 0x7c, 0x1e,
+ 0xb9, 0x4f, 0x40, 0x5d, 0x1e, 0xc7, 0x85, 0xcc,
+ 0x6f, 0x6a, 0x68, 0xf6, 0x25, 0x4d, 0xd8, 0x33,
+ 0x9f, 0x9d, 0x84, 0x05, 0x7e, 0x01, 0xa1, 0x77,
+ 0x41, 0x99, 0x04, 0x82, 0x99, 0x95, 0x16, 0xb5,
+ 0x61, 0x1a, 0x38, 0xf4, 0x1b, 0xb6, 0x47, 0x8e,
+ 0x6f, 0x17, 0x3f, 0x32, 0x08, 0x05, 0xdd, 0x71,
+ 0xb1, 0x93, 0x2f, 0xc3, 0x33, 0xcb, 0x9e, 0xe3,
+ 0x99, 0x36, 0xbe, 0xea, 0x9a, 0xd9, 0x6f, 0xa1,
+ 0x0f, 0xb4, 0x11, 0x2b, 0x90, 0x17, 0x34, 0xdd,
+ 0xad, 0x40, 0xbc, 0x18, 0x78, 0x99, 0x5f, 0x8e,
+ 0x11, 0xae, 0xe7, 0xd1, 0x41, 0xa2, 0xf5, 0xd4,
+ 0x8b, 0x7a, 0x4e, 0x1e, 0x7f, 0x0b, 0x2c, 0x04,
+ 0x83, 0x0e, 0x69, 0xa4, 0xfd, 0x13, 0x78, 0x41,
+ 0x1c, 0x2f, 0x28, 0x7e, 0xdf, 0x48, 0xc6, 0xc4,
+ 0xe5, 0xc2, 0x47, 0xa1, 0x96, 0x80, 0xf7, 0xfe,
+ 0x41, 0xce, 0xfb, 0xd4, 0x9b, 0x58, 0x21, 0x06,
+ 0xe3, 0x61, 0x6c, 0xbb, 0xe4, 0xdf, 0xb2, 0x34,
+ 0x4b, 0x2a, 0xe9, 0x51, 0x93, 0x91, 0xf3, 0xe0,
+ 0xfb, 0x49, 0x22, 0x25, 0x4b, 0x1d, 0x6d, 0x2d,
+ 0x19, 0xc6, 0xd4, 0xd5, 0x37, 0xb3, 0xa2, 0x6f,
+ 0x3b, 0xcc, 0x51, 0x58, 0x8b, 0x32, 0xf3, 0xec,
+ 0xa0, 0x82, 0x9b, 0x6a, 0x5a, 0xc7, 0x25, 0x78,
+ 0xfb, 0x81, 0x4f, 0xb4, 0x3c, 0xf8, 0x0d, 0x64,
+ 0xa2, 0x33, 0xe3, 0xf9, 0x97, 0xa3, 0xf0, 0x26,
+ 0x83, 0x34, 0x2f, 0x2b, 0x33, 0xd2, 0x5b, 0x49,
+ 0x25, 0x36, 0xb9, 0x3b, 0xec, 0xb2, 0xf5, 0xe1,
+ 0xa8, 0xb8, 0x2f, 0x5b, 0x88, 0x33, 0x42, 0x72,
+ 0x9e, 0x8a, 0xe0, 0x9d, 0x16, 0x93, 0x88, 0x41,
+ 0xa2, 0x1a, 0x97, 0xfb, 0x54, 0x3e, 0xea, 0x3b,
+ 0xbf, 0xf5, 0x9f, 0x13, 0xc1, 0xa1, 0x84, 0x49,
+ 0xe3, 0x98, 0x70, 0x1c, 0x1a, 0xd5, 0x16, 0x48,
+ 0x34, 0x6c, 0xbc, 0x04, 0xc2, 0x7b, 0xb2, 0xda,
+ 0x3b, 0x93, 0xa1, 0x37, 0x2c, 0xca, 0xe5, 0x48,
+ 0xfb, 0x53, 0xbe, 0xe4, 0x76, 0xf9, 0xe9, 0xc9,
+ 0x17, 0x73, 0xb1, 0xbb, 0x19, 0x82, 0x83, 0x94,
+ 0xd5, 0x5d, 0x3e, 0x1a, 0x20, 0xed, 0x69, 0x11,
+ 0x3a, 0x86, 0x0b, 0x68, 0x29, 0xff, 0xa8, 0x47,
+ 0x22, 0x46, 0x04, 0x43, 0x50, 0x70, 0x22, 0x1b,
+ 0x25, 0x7e, 0x8d, 0xff, 0x78, 0x36, 0x15, 0xd2,
+ 0xca, 0xe4, 0x80, 0x3a, 0x93, 0xaa, 0x43, 0x34,
+ 0xab, 0x48, 0x2a, 0x0a, 0xfa, 0xc9, 0xc0, 0xae,
+ 0xda, 0x70, 0xb4, 0x5a, 0x48, 0x1d, 0xf5, 0xde,
+ 0xc5, 0xdf, 0x8c, 0xc0, 0xf4, 0x23, 0xc7, 0x7a,
+ 0x5f, 0xd4, 0x6c, 0xd3, 0x12, 0x02, 0x1d, 0x4b,
+ 0x43, 0x88, 0x62, 0x41, 0x9a, 0x79, 0x1b, 0xe0,
+ 0x3b, 0xb4, 0xd9, 0x7c, 0x0e, 0x59, 0x57, 0x85,
+ 0x42, 0x53, 0x1b, 0xa4, 0x66, 0xa8, 0x3b, 0xaf,
+ 0x92, 0xce, 0xfc, 0x15, 0x1b, 0x5c, 0xc1, 0x61,
+ 0x1a, 0x16, 0x78, 0x93, 0x81, 0x9b, 0x63, 0xfb,
+ 0x8a, 0x6b, 0x18, 0xe8, 0x6d, 0xe6, 0x02, 0x90,
+ 0xfa, 0x72, 0xb7, 0x97, 0xb0, 0xce, 0x59, 0xf3
+};
+
+// Define vector of structs, with pointers to the statically defined vectors
+
+struct xts_vector vlist[NVEC] = {
+
+ // pointers to the statically defined vectors here
+
+ // Vector 1
+ {sizeof(v1_CTX), v1_key1, v1_key2, v1_TW, v1_PTX, v1_CTX}
+ ,
+ // Vector 2
+ {sizeof(v2_CTX), v2_key1, v2_key2, v2_TW, v2_PTX, v2_CTX}
+ ,
+ // Vector 3
+ {sizeof(v3_CTX), v3_key1, v3_key2, v3_TW, v3_PTX, v3_CTX}
+ ,
+ // Vector 4
+ {sizeof(v4_CTX), v4_key1, v4_key2, v4_TW, v4_PTX, v4_CTX}
+ ,
+ // Vector 5
+ {sizeof(v5_CTX), v5_key1, v5_key2, v5_TW, v5_PTX, v5_CTX}
+ ,
+ // Vector 6
+ {sizeof(v6_CTX), v6_key1, v6_key2, v6_TW, v6_PTX, v6_CTX}
+ ,
+ // Vector 7
+ {sizeof(v7_CTX), v7_key1, v7_key2, v7_TW, v7_PTX, v7_CTX}
+ ,
+ // Vector 8
+ {sizeof(v8_CTX), v8_key1, v8_key2, v8_TW, v8_PTX, v8_CTX}
+ ,
+ // Vector 9
+ {sizeof(v9_CTX), v9_key1, v9_key2, v9_TW, v9_PTX, v9_CTX}
+ ,
+ // Vector 15
+ {sizeof(v15_CTX), v15_key1, v15_key2, v15_TW, v15_PTX, v15_CTX}
+ ,
+ // Vector 16
+ {sizeof(v16_CTX), v16_key1, v16_key2, v16_TW, v16_PTX, v16_CTX}
+ ,
+ // Vector 17
+ {sizeof(v17_CTX), v17_key1, v17_key2, v17_TW, v17_PTX, v17_CTX}
+ ,
+ // Vector 18
+ {sizeof(v18_CTX), v18_key1, v18_key2, v18_TW, v18_PTX, v18_CTX}
+ ,
+ // Vector 19
+ {sizeof(v19_CTX), v19_key1, v19_key2, v19_TW, v19_PTX, v19_CTX}
+
+};
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c
new file mode 100644
index 00000000..a3b3fc12
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+// Benchmark sizing. Uncomment the next line to select the small, looped
+// ("_warm") data set instead of the default large ("_cold") one.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN      (8 * 1024)
+# define TEST_LOOPS    400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+// Size expressions are parenthesized so they expand safely inside any
+// surrounding expression (e.g. x % TEST_LEN).
+# define GT_L3_CACHE   (32 * 1024 * 1024)	/* some number > last level cache */
+# define TEST_LEN      (2 * GT_L3_CACHE)
+# define TEST_LOOPS    50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+/*
+ * Fill the two 32-byte key buffers, the 16-byte tweak and n bytes of
+ * plaintext with values from rand().
+ *
+ * k1, k2 - key buffers, 32 bytes each; bytes are drawn alternately (k1 first)
+ * t      - tweak buffer, 16 bytes
+ * p      - data buffer, at least n bytes
+ * n      - number of data bytes to generate
+ */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+			 unsigned char *p, int n)
+{
+	int idx;
+
+	for (idx = 0; idx < 32; idx++) {
+		k1[idx] = rand();
+		k2[idx] = rand();
+	}
+
+	for (idx = 0; idx < 16; idx++)
+		t[idx] = rand();
+
+	for (idx = 0; idx < n; idx++)
+		p[idx] = rand();
+}
+
+/*
+ * Run one AES-256-XTS decryption through OpenSSL's EVP interface.
+ *
+ * ctx - cipher context; re-initialised with key and iv on every call
+ * key - key material handed to EVP_DecryptInit_ex
+ * iv  - initial tweak, passed as the IV
+ * len - number of bytes to process
+ * pt  - INPUT buffer: despite its name it is the source given to
+ *       EVP_DecryptUpdate (the caller in this file passes ciphertext)
+ * ct  - OUTPUT buffer that receives the decrypted bytes
+ *
+ * Returns 0 on success. If an EVP call fails the error banner is printed
+ * and -1 is returned at once, so a failed update can never leave outlen
+ * unset before it is used to position the EVP_DecryptFinal_ex output.
+ */
+static inline
+int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+			    int len, unsigned char *pt, unsigned char *ct)
+{
+	int outlen = 0, tmplen = 0;
+
+	if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)) {
+		printf("\n ERROR!! \n");
+		return -1;
+	}
+	if (!EVP_DecryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) {
+		printf("\n ERROR!! \n");
+		return -1;
+	}
+	if (!EVP_DecryptFinal_ex(ctx, ct + outlen, &tmplen)) {
+		printf("\n ERROR!! \n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Benchmark ISA-L XTS_AES_256_dec against OpenSSL's EVP AES-256-XTS decrypt.
+ *
+ * Random keys, tweak and plaintext are generated and encrypted once with
+ * ISA-L. The ISA-L and OpenSSL decryptions of that ciphertext are compared,
+ * then each implementation is timed over TEST_LOOPS iterations.
+ *
+ * Returns 0 on success, -1 on allocation failure or result mismatch.
+ */
+int main(void)
+{
+	int i;
+	int ret = -1;
+
+	unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+	unsigned char *pt = NULL, *ct = NULL, *dt = NULL, *refdt = NULL;
+	struct perf start, stop;
+	unsigned char keyssl[64];	/* SSL takes both keys together */
+	EVP_CIPHER_CTX *ctx;
+
+	/*
+	 * Heap-allocate the cipher context: EVP_CIPHER_CTX is an opaque type
+	 * from OpenSSL 1.1.0 on, so it can no longer be declared on the stack.
+	 * The same context is reused for every OpenSSL call below.
+	 */
+	ctx = EVP_CIPHER_CTX_new();
+	if (NULL == ctx) {
+		printf("EVP_CIPHER_CTX_new failed\n");
+		return -1;
+	}
+
+	printf("aes_xts_256_dec_perf:\n");
+
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	dt = malloc(TEST_LEN);
+	refdt = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) {
+		printf("malloc of testsize failed\n");
+		goto cleanup;
+	}
+
+	xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+	/* Set up key for the SSL engine: key1 followed by key2 */
+	for (i = 0; i < 32; i++) {
+		keyssl[i] = key1[i];
+		keyssl[i + 32] = key2[i];
+	}
+
+	/* Encrypt and compare decrypted output */
+	XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+	openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt);
+	if (memcmp(dt, refdt, TEST_LEN)) {
+		printf("ISA-L and OpenSSL results don't match\n");
+		goto cleanup;
+	}
+
+	/* Time ISA-L decryption */
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++)
+		XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+	perf_stop(&stop);
+	printf("aes_xts_256_dec" TEST_TYPE_STR ": ");
+	/* i == TEST_LOOPS here, so this is the total number of bytes processed */
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	/* Time OpenSSL decryption */
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++)
+		openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt);
+	perf_stop(&stop);
+	printf("aes_xts_256_openssl_dec" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	ret = 0;
+
+cleanup:
+	/* Single exit: release everything acquired above (free(NULL) is a no-op) */
+	free(pt);
+	free(ct);
+	free(dt);
+	free(refdt);
+	EVP_CIPHER_CTX_free(ctx);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c
new file mode 100644
index 00000000..30dce695
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c
@@ -0,0 +1,103 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+// Benchmark sizing. Uncomment the next line to select the small, looped
+// ("_warm") data set instead of the default large ("_cold") one.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN      (8 * 1024)
+# define TEST_LOOPS    400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+// Size expressions are parenthesized so they expand safely inside any
+// surrounding expression (e.g. x % TEST_LEN).
+# define GT_L3_CACHE   (32 * 1024 * 1024)	/* some number > last level cache */
+# define TEST_LEN      (2 * GT_L3_CACHE)
+# define TEST_LOOPS    50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+/*
+ * Fill the two 32-byte key buffers, the 16-byte tweak and n bytes of
+ * plaintext with values from rand().
+ *
+ * k1, k2 - key buffers, 32 bytes each; bytes are drawn alternately (k1 first)
+ * t      - tweak buffer, 16 bytes
+ * p      - data buffer, at least n bytes
+ * n      - number of data bytes to generate
+ */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+			 unsigned char *p, int n)
+{
+	int idx;
+
+	for (idx = 0; idx < 32; idx++) {
+		k1[idx] = rand();
+		k2[idx] = rand();
+	}
+
+	for (idx = 0; idx < 16; idx++)
+		t[idx] = rand();
+
+	for (idx = 0; idx < n; idx++)
+		p[idx] = rand();
+}
+
+/*
+ * Benchmark ISA-L XTS_AES_256_dec.
+ *
+ * Random keys, tweak and plaintext are generated and encrypted once; one
+ * untimed decryption is run, then decryption is timed over TEST_LOOPS
+ * iterations and the result is reported through perf_print.
+ *
+ * Returns 0 on success, -1 if the working buffers cannot be allocated.
+ */
+int main(void)
+{
+	int i;
+	int ret = -1;
+
+	unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+	unsigned char *pt = NULL, *ct = NULL, *dt = NULL;
+	struct perf start, stop;
+
+	printf("aes_xts_256_dec_perf:\n");
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	dt = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == dt) {
+		printf("malloc of testsize failed\n");
+		goto cleanup;
+	}
+
+	xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+	XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	/* One untimed pass before the measured loop */
+	XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+	perf_start(&start);
+
+	for (i = 0; i < TEST_LOOPS; i++) {
+		XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+	}
+
+	perf_stop(&stop);
+
+	printf("aes_xts_256_dec" TEST_TYPE_STR ": ");
+	/* i == TEST_LOOPS here, so this is the total number of bytes processed */
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	ret = 0;
+
+cleanup:
+	/* Single exit: release the buffers (free(NULL) is a no-op) */
+	free(pt);
+	free(ct);
+	free(dt);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c
new file mode 100644
index 00000000..6431c27c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+// Benchmark sizing. Uncomment the next line to select the small, looped
+// ("_warm") data set instead of the default large ("_cold") one.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN      (8 * 1024)
+# define TEST_LOOPS    400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+// Size expressions are parenthesized so they expand safely inside any
+// surrounding expression (e.g. x % TEST_LEN).
+# define GT_L3_CACHE   (32 * 1024 * 1024)	/* some number > last level cache */
+# define TEST_LEN      (2 * GT_L3_CACHE)
+# define TEST_LOOPS    50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+/*
+ * Fill the two 32-byte key buffers, the 16-byte tweak and n bytes of
+ * plaintext with values from rand().
+ *
+ * k1, k2 - key buffers, 32 bytes each; bytes are drawn alternately (k1 first)
+ * t      - tweak buffer, 16 bytes
+ * p      - data buffer, at least n bytes
+ * n      - number of data bytes to generate
+ */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+			 unsigned char *p, int n)
+{
+	int idx;
+
+	for (idx = 0; idx < 32; idx++) {
+		k1[idx] = rand();
+		k2[idx] = rand();
+	}
+
+	for (idx = 0; idx < 16; idx++)
+		t[idx] = rand();
+
+	for (idx = 0; idx < n; idx++)
+		p[idx] = rand();
+}
+
+/*
+ * Run one AES-256-XTS encryption through OpenSSL's EVP interface.
+ *
+ * ctx - cipher context; re-initialised with key and iv on every call
+ * key - key material handed to EVP_EncryptInit_ex
+ * iv  - initial tweak, passed as the IV
+ * len - number of bytes to process
+ * pt  - input (plaintext) buffer
+ * ct  - output buffer that receives the encrypted bytes
+ *
+ * Returns 0 on success. If an EVP call fails the error banner is printed
+ * and -1 is returned at once, so a failed update can never leave outlen
+ * unset before it is used to position the EVP_EncryptFinal_ex output.
+ */
+static inline
+int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+			    int len, unsigned char *pt, unsigned char *ct)
+{
+	int outlen = 0, tmplen = 0;
+
+	if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)) {
+		printf("\n ERROR!! \n");
+		return -1;
+	}
+	if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) {
+		printf("\n ERROR!! \n");
+		return -1;
+	}
+	if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen)) {
+		printf("\n ERROR!! \n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Benchmark ISA-L XTS_AES_256_enc against OpenSSL's EVP AES-256-XTS encrypt.
+ *
+ * Random keys, tweak and plaintext are generated, both implementations
+ * encrypt the same data once and the ciphertexts are compared, then each
+ * implementation is timed over TEST_LOOPS iterations.
+ *
+ * Returns 0 on success, -1 on allocation failure or result mismatch.
+ */
+int main(void)
+{
+	int i;
+	int ret = -1;
+	unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+	unsigned char *pt = NULL, *ct = NULL, *refct = NULL;
+	struct perf start, stop;
+	unsigned char keyssl[64];	/* SSL takes both keys together */
+	EVP_CIPHER_CTX *ctx;
+
+	/*
+	 * Heap-allocate the cipher context: EVP_CIPHER_CTX is an opaque type
+	 * from OpenSSL 1.1.0 on, so it can no longer be declared on the stack.
+	 * The same context is reused for every OpenSSL call below.
+	 */
+	ctx = EVP_CIPHER_CTX_new();
+	if (NULL == ctx) {
+		printf("EVP_CIPHER_CTX_new failed\n");
+		return -1;
+	}
+
+	printf("aes_xts_256_enc_perf:\n");
+
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	refct = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == refct) {
+		printf("malloc of testsize failed\n");
+		goto cleanup;
+	}
+
+	xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+	/* Set up key for the SSL engine: key1 followed by key2 */
+	for (i = 0; i < 32; i++) {
+		keyssl[i] = key1[i];
+		keyssl[i + 32] = key2[i];
+	}
+
+	/* Encrypt and compare output */
+	XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+	if (memcmp(ct, refct, TEST_LEN)) {
+		printf("ISA-L and OpenSSL results don't match\n");
+		goto cleanup;
+	}
+
+	/* Time ISA-L encryption */
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++)
+		XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	perf_stop(&stop);
+
+	printf("aes_xts_256_enc" TEST_TYPE_STR ": ");
+	/* i == TEST_LOOPS here, so this is the total number of bytes processed */
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	/* Time OpenSSL encryption */
+	perf_start(&start);
+	for (i = 0; i < TEST_LOOPS; i++)
+		openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+	perf_stop(&stop);
+
+	printf("aes_xts_256_ossl_enc" TEST_TYPE_STR ": ");
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	ret = 0;
+
+cleanup:
+	/* Single exit: release everything acquired above (free(NULL) is a no-op) */
+	free(pt);
+	free(ct);
+	free(refct);
+	EVP_CIPHER_CTX_free(ctx);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c
new file mode 100644
index 00000000..ac536e21
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c
@@ -0,0 +1,101 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+// Benchmark sizing. Uncomment the next line to select the small, looped
+// ("_warm") data set instead of the default large ("_cold") one.
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN      (8 * 1024)
+# define TEST_LOOPS    400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+// Size expressions are parenthesized so they expand safely inside any
+// surrounding expression (e.g. x % TEST_LEN).
+# define GT_L3_CACHE   (32 * 1024 * 1024)	/* some number > last level cache */
+# define TEST_LEN      (2 * GT_L3_CACHE)
+# define TEST_LOOPS    50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+/*
+ * Fill the two 32-byte key buffers, the 16-byte tweak and n bytes of
+ * plaintext with values from rand().
+ *
+ * k1, k2 - key buffers, 32 bytes each; bytes are drawn alternately (k1 first)
+ * t      - tweak buffer, 16 bytes
+ * p      - data buffer, at least n bytes
+ * n      - number of data bytes to generate
+ */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+			 unsigned char *p, int n)
+{
+	int idx;
+
+	for (idx = 0; idx < 32; idx++) {
+		k1[idx] = rand();
+		k2[idx] = rand();
+	}
+
+	for (idx = 0; idx < 16; idx++)
+		t[idx] = rand();
+
+	for (idx = 0; idx < n; idx++)
+		p[idx] = rand();
+}
+
+/*
+ * Benchmark ISA-L XTS_AES_256_enc.
+ *
+ * Random keys, tweak and plaintext are generated, one untimed encryption is
+ * run, then encryption is timed over TEST_LOOPS iterations and the result is
+ * reported through perf_print.
+ *
+ * Returns 0 on success, -1 if the working buffers cannot be allocated.
+ */
+int main(void)
+{
+	int i;
+	int ret = -1;
+
+	unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+	unsigned char *pt = NULL, *ct = NULL;
+	struct perf start, stop;
+
+	printf("aes_xts_256_enc_perf:\n");
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct) {
+		printf("malloc of testsize failed\n");
+		goto cleanup;
+	}
+
+	xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+	/* One untimed pass before the measured loop */
+	XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+
+	perf_start(&start);
+
+	for (i = 0; i < TEST_LOOPS; i++) {
+		XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	}
+
+	perf_stop(&stop);
+
+	printf("aes_xts_256_enc" TEST_TYPE_STR ": ");
+	/* i == TEST_LOOPS here, so this is the total number of bytes processed */
+	perf_print(stop, start, (long long)TEST_LEN * i);
+
+	ret = 0;
+
+cleanup:
+	/* Single exit: release the buffers (free(NULL) is a no-op) */
+	free(pt);
+	free(ct);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c
new file mode 100644
index 00000000..8278a226
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c
@@ -0,0 +1,113 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <aes_keyexp.h>
+#include "xts_256_vect.h"
+
+/*
+ * Known-answer test for the pre-expanded-key AES-256 XTS entry points.
+ *
+ * For every entry of vlist the plaintext is encrypted and compared byte by
+ * byte with the reference ciphertext; a second pass decrypts the reference
+ * ciphertext and compares it with the reference plaintext. One '.' is
+ * printed per passing vector.
+ *
+ * Returns 0 after printing "Pass", or -1 on the first allocation failure or
+ * mismatching byte.
+ */
+int main(void)
+{
+
+	// Temporary array for the calculated vectors
+	uint8_t *ct_test;
+	uint8_t *pt_test;
+	// Arrays for expanded keys, null_key is a dummy vector (decrypt key not
+	// needed for the tweak part of the decryption)
+	uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15];
+	uint8_t expkey1_dec[16 * 15], null_key[16 * 15];
+
+	int i, j;
+
+	// --- Encryption test ---
+
+	// Loop over the vectors
+	for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated ciphertext
+		ct_test = malloc(vlist[i].ptlen);
+		if (ct_test == NULL) {
+			printf("Can't allocate ciphertext memory\n");
+			return -1;
+		}
+		// Pre-expand our keys (will only use the encryption ones here)
+		aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec);
+		aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key);
+
+		XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW,
+					     vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+		// Carry out comparison of the calculated ciphertext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (ct_test[j] != vlist[i].CTX[j]) {
+				// The reported vector number is the table index plus 10
+				printf("\nXTS_AES_256_enc: Vector %d: ", i + 10);
+				printf("failed at byte %d! \n", j);
+				free(ct_test);
+				return -1;
+			}
+		}
+		// The buffer is per-vector; release it before the next iteration
+		free(ct_test);
+		printf(".");
+	}
+
+	// --- Decryption test ---
+
+	// Loop over the vectors
+	for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+		pt_test = malloc(vlist[i].ptlen);
+		if (pt_test == NULL) {
+			printf("Can't allocate plaintext memory\n");
+			return -1;
+		}
+		// Pre-expand keys for the decryption
+		aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec);
+		aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key);
+
+		// Note, encryption key is re-used for the tweak decryption step
+		XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW,
+					     vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (pt_test[j] != vlist[i].PTX[j]) {
+				printf("\nXTS_AES_256_dec: Vector %d: ", i + 10);
+				printf("failed at byte %d! \n", j);
+				free(pt_test);
+				return -1;
+			}
+		}
+		// The buffer is per-vector; release it before the next iteration
+		free(pt_test);
+		printf(".");
+	}
+	printf("Pass\n");
+
+	return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c
new file mode 100644
index 00000000..bfa10b6b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_xts.h>
+#include <aes_keyexp.h>
+
+// Size in bytes of each of the pt/ct/dt working buffers
+#define TEST_LEN  (1024 * 1024)
+// Largest message length used by the end-of-buffer (Electric Fence) passes
+#define TEST_SIZE (4096)
+// Number of random-length round trips; only defined here if not already set
+#if !defined(RANDOMS)
+# define RANDOMS 10
+#endif
+
+/*
+ * Fill the two 32-byte key buffers, the 16-byte tweak and n bytes of
+ * plaintext with values from rand().
+ *
+ * k1, k2 - key buffers, 32 bytes each; bytes are drawn alternately (k1 first)
+ * t      - tweak buffer, 16 bytes
+ * p      - data buffer, at least n bytes
+ * n      - number of data bytes to generate
+ */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+			 unsigned char *p, int n)
+{
+	int idx;
+
+	for (idx = 0; idx < 32; idx++) {
+		k1[idx] = rand();
+		k2[idx] = rand();
+	}
+
+	for (idx = 0; idx < 16; idx++)
+		t[idx] = rand();
+
+	for (idx = 0; idx < n; idx++)
+		p[idx] = rand();
+}
+
+/*
+ * Round-trip self test for the AES-256 XTS routines. Passes, in order:
+ *   1. one TEST_LEN-byte encrypt/decrypt round trip;
+ *   2. RANDOMS round trips with fresh random keys and random lengths
+ *      (draws shorter than 17 bytes are skipped);
+ *   3. round trips for every length from TEST_SIZE down to 16, positioned so
+ *      the data ends exactly at the end of each buffer;
+ *   4. lengths 15 down to 0, checking that none of the buffers is modified;
+ *   5. pass 3 repeated through the *_expanded_key entry points;
+ *   6. pass 4 repeated through the *_expanded_key entry points.
+ * A '.' is printed per successful step.
+ *
+ * Returns 0 after printing "Pass", or -1 on the first failure. Every path
+ * leaves through the cleanup label so the heap buffers are always released.
+ */
+int main(void)
+{
+	int t, n;
+	int ret = -1;
+
+	unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+	unsigned char *pt = NULL, *ct = NULL, *dt = NULL;
+
+	int align, size, min_size;
+	unsigned char *efence_pt;
+	unsigned char *efence_ct;
+	unsigned char *efence_dt;
+
+	// Snapshots used to prove the short-length calls leave buffers untouched
+	unsigned char *origin_pt = NULL;
+	unsigned char *origin_ct = NULL;
+	unsigned char *origin_dt = NULL;
+
+	unsigned char key1_exp_enc[16 * 15], key1_exp_dec[16 * 15];
+	unsigned char key2_exp_tw[16 * 15];
+	int i;
+
+	printf("aes_xts_256 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN);
+	pt = malloc(TEST_LEN);
+	ct = malloc(TEST_LEN);
+	dt = malloc(TEST_LEN);
+
+	if (NULL == pt || NULL == ct || NULL == dt) {
+		printf("malloc of testsize failed\n");
+		goto cleanup;
+	}
+
+	xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+	XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+	XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+	if (memcmp(pt, dt, TEST_LEN)) {
+		printf("fail\n");
+		goto cleanup;
+	}
+	putchar('.');
+
+	// Do tests with random data, keys and message size
+	for (t = 0; t < RANDOMS; t++) {
+		n = rand() % (TEST_LEN);
+		if (n < 17)
+			continue;
+
+		xts256_mk_rand_data(key1, key2, tinit, pt, n);
+		XTS_AES_256_enc(key2, key1, tinit, n, pt, ct);
+		XTS_AES_256_dec(key2, key1, tinit, n, ct, dt);
+
+		if (memcmp(pt, dt, n)) {
+			printf("fail rand %d, size %d\n", t, n);
+			goto cleanup;
+		}
+		putchar('.');
+		fflush(NULL);
+	}
+
+	// Run tests at end of buffer for Electric Fence
+	align = 1;
+	min_size = 16;
+	for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+		// Line up TEST_SIZE from end
+		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+		efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+		xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+		XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+		XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+		if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+			printf("efence: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		putchar('.');
+		fflush(NULL);
+	}
+
+	origin_pt = malloc(TEST_LEN);
+	origin_ct = malloc(TEST_LEN);
+	origin_dt = malloc(TEST_LEN);
+	if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) {
+		printf("malloc of testsize failed\n");
+		goto cleanup;
+	}
+	// For data lengths from 0 to 15 bytes, the functions return without any error
+	// codes, without reading or writing any data.
+	for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+		// Line up TEST_SIZE from end
+		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+		efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+		xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+		memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+		memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+		memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+		memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+		memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+		XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+		XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+		if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+			printf("efence_pt: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+			printf("efence_ct: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+			printf("efence_dt: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		putchar('.');
+		fflush(NULL);
+	}
+
+	// The tweak key schedule is filled with random bytes rather than expanded
+	// from key2; the same schedule is handed to both the encrypt and decrypt calls.
+	for (i = 0; i < 16 * 15; i++) {
+		key2_exp_tw[i] = rand();
+	}
+
+	for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+		// Line up TEST_SIZE from end
+		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+		efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+		xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+		aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec);
+
+		XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+					     TEST_SIZE - size, efence_pt, efence_ct);
+		XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+					     TEST_SIZE - size, efence_ct, efence_dt);
+
+		if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+			printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		putchar('.');
+		fflush(NULL);
+	}
+
+	// For data lengths from 0 to 15 bytes, the functions return without any error
+	// codes, without reading or writing any data.
+	for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+		// Line up TEST_SIZE from end
+		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+		efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+		xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+		memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+		memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+		memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+		memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+		memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+		aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec);
+
+		XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+					     TEST_SIZE - size, efence_pt, efence_ct);
+		XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+					     TEST_SIZE - size, efence_ct, efence_dt);
+
+		if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+			printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+			printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+		if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+			printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size);
+			goto cleanup;
+		}
+
+		putchar('.');
+		fflush(NULL);
+	}
+
+	printf("Pass\n");
+	ret = 0;
+
+cleanup:
+	// Single exit: free(NULL) is a no-op, so this is safe from any point above
+	free(origin_dt);
+	free(origin_ct);
+	free(origin_pt);
+	free(dt);
+	free(ct);
+	free(pt);
+
+	return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c
new file mode 100644
index 00000000..99308b37
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c
@@ -0,0 +1,209 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_LOOPS 100
+#ifndef RANDOMS
+# define RANDOMS 100
+#endif
+
+/*
+ * Fill the test inputs with pseudo-random bytes taken from rand().
+ *
+ *   k1, k2 - two 32-byte key buffers
+ *   t      - 16-byte tweak buffer
+ *   p      - plaintext buffer that receives n bytes
+ *   n      - number of plaintext bytes to generate
+ *
+ * Each rand() result is truncated to one byte by the store.
+ */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+                         unsigned char *p, int n)
+{
+    int idx;
+
+    /* The two keys are written alternately, one byte of each per step,
+     * so the order in which rand() values are consumed is k1, k2, k1, ... */
+    for (idx = 0; idx != 32; ++idx) {
+        k1[idx] = rand();
+        k2[idx] = rand();
+    }
+
+    for (idx = 0; idx != 16; ++idx)
+        t[idx] = rand();
+
+    idx = 0;
+    while (idx < n) {
+        p[idx] = rand();
+        ++idx;
+    }
+}
+
+/*
+ * Wrapper for OpenSSL EVP AES-XTS 256 encryption.
+ *
+ *   ctx - cipher context, (re)initialised here for AES-256-XTS encryption
+ *   key - key material handed to EVP_EncryptInit_ex(); main() in this file
+ *         passes a 64-byte buffer holding key1 followed by key2
+ *   iv  - initial tweak value
+ *   len - number of bytes to encrypt
+ *   pt  - input plaintext
+ *   ct  - output buffer for the ciphertext
+ *
+ * Returns 0 on success. On the first EVP call that reports failure the
+ * error banner is printed and -1 is returned immediately, so a failed
+ * EVP_EncryptUpdate() can no longer feed an unset outlen into the
+ * "ct + outlen" pointer passed to EVP_EncryptFinal_ex().
+ */
+static inline
+int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+                            int len, unsigned char *pt, unsigned char *ct)
+{
+    int outlen = 0, tmplen = 0;
+
+    if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)) {
+        printf("\n ERROR!! \n");
+        return -1;
+    }
+    if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) {
+        printf("\n ERROR!! \n");
+        return -1;
+    }
+    if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen)) {
+        printf("\n ERROR!! \n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Wrapper for OpenSSL EVP AES-XTS 256 decryption.
+ *
+ *   ctx - cipher context, (re)initialised here for AES-256-XTS decryption
+ *   key - key material handed to EVP_DecryptInit_ex(); main() in this file
+ *         passes a 64-byte buffer holding key1 followed by key2
+ *   iv  - initial tweak value
+ *   len - number of bytes to decrypt
+ *   ct  - input ciphertext
+ *   dt  - output buffer for the decrypted text
+ *
+ * Returns 0 on success. On the first EVP call that reports failure the
+ * error banner is printed and -1 is returned immediately, so a failed
+ * EVP_DecryptUpdate() can no longer feed an unset outlen into the
+ * "dt + outlen" pointer passed to EVP_DecryptFinal_ex().
+ */
+static inline
+int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+                            int len, unsigned char *ct, unsigned char *dt)
+{
+    int outlen = 0, tmplen = 0;
+
+    if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)) {
+        printf("\n ERROR!! \n");
+        return -1;
+    }
+    if (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len)) {
+        printf("\n ERROR!! \n");
+        return -1;
+    }
+    if (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen)) {
+        printf("\n ERROR!! \n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Cross-checks XTS_AES_256_enc / XTS_AES_256_dec against OpenSSL's EVP
+ * AES-256-XTS implementation:
+ *   1. TEST_LOOPS rounds on buffers of exactly TEST_LEN bytes;
+ *   2. RANDOMS rounds on buffers of random length.
+ * Every round uses fresh random keys, tweak and plaintext.
+ *
+ * Returns 0 when all comparisons match, -1 on a setup failure, an OpenSSL
+ * wrapper failure, or the first mismatching byte.
+ */
+int main(void)
+{
+
+    unsigned char key1[32], key2[32], tinit[16];
+    unsigned char *pt = NULL, *ct = NULL, *dt = NULL, *refct = NULL, *refdt = NULL;
+    unsigned char keyssl[64];    /* SSL takes both keys together */
+    unsigned int rand_len, t;
+    int i, j, k;
+    int ret = -1;
+    EVP_CIPHER_CTX *ctx;
+
+    /* The context is heap-allocated through the OpenSSL API: from
+     * OpenSSL 1.1.0 on EVP_CIPHER_CTX is an opaque type and can no longer
+     * be declared as a local object. The same context is reused for every
+     * encrypt/decrypt call below. */
+    ctx = EVP_CIPHER_CTX_new();
+    if (NULL == ctx) {
+        printf("EVP_CIPHER_CTX_new failed\n");
+        return -1;
+    }
+
+    /* Allocate space for input and output buffers */
+    pt = malloc(TEST_LEN);
+    ct = malloc(TEST_LEN);
+    dt = malloc(TEST_LEN);
+    refct = malloc(TEST_LEN);
+    refdt = malloc(TEST_LEN);
+
+    if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+        printf("malloc of testsize failed\n");
+        goto out;
+    }
+
+    /**************************** FIXED LENGTH TEST *************************/
+    printf("aes_xts_256_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN);
+
+    /* Loop over the vectors */
+    for (i = 0; i < TEST_LOOPS; i++) {
+
+        xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+        /* Set up key for the SSL engine */
+        for (k = 0; k < 32; k++) {
+            keyssl[k] = key1[k];
+            keyssl[k + 32] = key2[k];
+        }
+
+        /* Encrypt using each method */
+        XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+        if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct) != 0)
+            goto out;
+
+        // Carry out comparison of the calculated ciphertext with
+        // the reference
+        for (j = 0; j < TEST_LEN; j++) {
+
+            if (ct[j] != refct[j]) {
+                printf("XTS_AES_256_enc failed at byte %d! \n", j);
+                goto out;
+            }
+        }
+
+        /* Decrypt using each method */
+        XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+        if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt) != 0)
+            goto out;
+
+        for (j = 0; j < TEST_LEN; j++) {
+
+            if (dt[j] != refdt[j]) {
+                printf("XTS_AES_256_dec failed at byte %d! \n", j);
+                goto out;
+            }
+        }
+        printf(".");
+        fflush(NULL);
+    }
+    printf("Pass\n");
+
+    /**************************** RANDOM LENGTH TEST *************************/
+    printf("aes_xts_256_rand_ossl test, %d sets of random lengths: ", RANDOMS);
+
+    /* Run tests with random size */
+    for (t = 0; t < RANDOMS; t++) {
+
+        rand_len = rand() % (TEST_LEN);
+        /* XTS needs at least one full 16-byte block. Shorter requests are
+         * not processed at all, which would leave this round comparing
+         * stale buffer contents, so clamp to the minimum. */
+        if (rand_len < 16)
+            rand_len = 16;
+        xts256_mk_rand_data(key1, key2, tinit, pt, rand_len);
+
+        /* Set up key for the SSL engine */
+        for (k = 0; k < 32; k++) {
+            keyssl[k] = key1[k];
+            keyssl[k + 32] = key2[k];
+        }
+
+        /* Encrypt using each method */
+        XTS_AES_256_enc(key2, key1, tinit, rand_len, pt, ct);
+        if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct) != 0)
+            goto out;
+
+        /* Carry out comparison of the calculated ciphertext with
+         * the reference. rand_len is below TEST_LEN, so it fits an int.
+         */
+        for (j = 0; j < (int)rand_len; j++) {
+
+            if (ct[j] != refct[j]) {
+                printf("XTS_AES_256_enc failed at byte %d! \n", j);
+                goto out;
+            }
+        }
+
+        /* Decrypt using each method */
+        XTS_AES_256_dec(key2, key1, tinit, rand_len, ct, dt);
+        if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt) != 0)
+            goto out;
+
+        for (j = 0; j < (int)rand_len; j++) {
+
+            if (dt[j] != refdt[j]) {
+                printf("XTS_AES_256_dec failed at byte %d! \n", j);
+                goto out;
+            }
+        }
+        printf(".");
+        fflush(NULL);
+    }
+    printf("Pass\n");
+
+    printf("aes_xts_256_rand_ossl: All tests passed\n");
+    ret = 0;
+
+out:
+    /* Single exit path: free(NULL) is a no-op, so this is safe even when
+     * only some of the allocations succeeded. */
+    free(pt);
+    free(ct);
+    free(dt);
+    free(refct);
+    free(refdt);
+    EVP_CIPHER_CTX_free(ctx);
+
+    return ret;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c
new file mode 100644
index 00000000..fc349524
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c
@@ -0,0 +1,105 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "xts_256_vect.h"
+
+/*
+ * Known-answer test for XTS_AES_256_enc and XTS_AES_256_dec.
+ *
+ * For each of the NVEC entries in vlist the plaintext is encrypted and the
+ * result compared byte by byte with the expected ciphertext; then the
+ * ciphertext is decrypted and compared with the expected plaintext.
+ * One '.' is printed per passing vector.
+ *
+ * Returns 0 when every vector passes, -1 on an allocation failure or at the
+ * first mismatching byte.
+ */
+int main(void)
+{
+
+    // Temporary arrays for the calculated vectors
+    uint8_t *ct_test;
+    uint8_t *pt_test;
+
+    int i;
+    uint64_t j;    // same type as xts_vector.ptlen, avoids a signed/unsigned compare
+
+    // --- Encryption test ---
+
+    // Loop over the vectors
+    for (i = 0; i < NVEC; i++) {
+
+        // Allocate space for the calculated ciphertext
+        ct_test = malloc(vlist[i].ptlen);
+        if (ct_test == NULL) {
+            fprintf(stderr, "Can't allocate ciphertext memory\n");
+            return -1;
+        }
+
+        XTS_AES_256_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+                        vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+        // Carry out comparison of the calculated ciphertext with
+        // the reference
+        for (j = 0; j < vlist[i].ptlen; j++) {
+
+            if (ct_test[j] != vlist[i].CTX[j]) {
+                // Vectors are reported with the numbering used in the
+                // vector header, which starts at 10
+                printf("\nXTS_AES_256_enc: Vector %d: ", i + 10);
+                printf("failed at byte %d! \n", (int)j);
+                free(ct_test);
+                return -1;
+            }
+        }
+        printf(".");
+
+        // Release this vector's buffer before the next iteration allocates
+        free(ct_test);
+        ct_test = NULL;
+    }
+
+    // --- Decryption test ---
+
+    // Loop over the vectors
+    for (i = 0; i < NVEC; i++) {
+
+        // Allocate space for the calculated plaintext
+        pt_test = malloc(vlist[i].ptlen);
+        if (pt_test == NULL) {
+            fprintf(stderr, "Can't allocate plaintext memory\n");
+            return -1;
+        }
+
+        XTS_AES_256_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+                        vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+        // Carry out comparison of the calculated plaintext with
+        // the reference
+        for (j = 0; j < vlist[i].ptlen; j++) {
+
+            if (pt_test[j] != vlist[i].PTX[j]) {
+                printf("\nXTS_AES_256_dec: Vector %d: ", i + 10);
+                printf("failed at byte %d! \n", (int)j);
+                free(pt_test);
+                return -1;
+            }
+        }
+        printf(".");
+
+        // Release this vector's buffer before the next iteration allocates
+        free(pt_test);
+        pt_test = NULL;
+    }
+    printf("Pass\n");
+
+    return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
new file mode 100644
index 00000000..640d1ddf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
@@ -0,0 +1,1035 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+
+#define NVEC 5
+
+// struct to hold the plaintext length and pointers to the key, tweak,
+// plaintext and ciphertext buffers of one XTS-AES-256 test vector
+struct xts_vector {
+ uint64_t ptlen; // length of our plaintext, in bytes
+ uint8_t *key1; // dimension 32 for 256 bit aes (the vectors in this header use 32-byte keys)
+ uint8_t *key2; // dimension 32 for 256 bit aes
+ uint8_t *TW; // dimension 16 for both 128 and 256 bit
+ uint8_t *PTX; // min. dimension 16
+ uint8_t *CTX; // same dimension as PTX
+};
+
+/* Define our test vectors statically here. Test vectors are from the standard:
+ * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented
+ * Storage Devices"
+ * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450
+ *
+ * Vector 10
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 1c3b3a102f770386e4836c99e370cf9bea00803f5e482357a4ae12d414a3e63b
+ * CTX 5d31e276f8fe4a8d66b317f9ac683f44680a86ac35adfc3345befecb4bb188fd
+ * CTX 5776926c49a3095eb108fd1098baec70aaa66999a72a82f27d848b21d4a741b0
+ * CTX c5cd4d5fff9dac89aeba122961d03a757123e9870f8acf1000020887891429ca
+ * CTX 2a3e7a7d7df7b10355165c8b9a6d0a7de8b062c4500dc4cd120c0f7418dae3d0
+ * CTX b5781c34803fa75421c790dfe1de1834f280d7667b327f6c8cd7557e12ac3a0f
+ * CTX 93ec05c52e0493ef31a12d3d9260f79a289d6a379bc70c50841473d1a8cc81ec
+ * CTX 583e9645e07b8d9670655ba5bbcfecc6dc3966380ad8fecb17b6ba02469a020a
+ * CTX 84e18e8f84252070c13e9f1f289be54fbc481457778f616015e1327a02b140f1
+ * CTX 505eb309326d68378f8374595c849d84f4c333ec4423885143cb47bd71c5edae
+ * CTX 9be69a2ffeceb1bec9de244fbe15992b11b77c040f12bd8f6a975a44a0f90c29
+ * CTX a9abc3d4d893927284c58754cce294529f8614dcd2aba991925fedc4ae74ffac
+ * CTX 6e333b93eb4aff0479da9a410e4450e0dd7ae4c6e2910900575da401fc07059f
+ * CTX 645e8b7e9bfdef33943054ff84011493c27b3429eaedb4ed5376441a77ed4385
+ * CTX 1ad77f16f541dfd269d50d6a5f14fb0aab1cbb4c1550be97f7ab4066193c4caa
+ * CTX 773dad38014bd2092fa755c824bb5e54c4f36ffda9fcea70b9c6e693e148c151
+ * Plaintext length (bytes): 512
+ */
+
+// Vector 10, Key1: the 32 bytes of the "Key1" hex string in the Vector 10 comment
+static uint8_t v10_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+// Vector 10, Key2: the 32 bytes of the "Key2" hex string in the Vector 10 comment
+static uint8_t v10_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+// Vector 10, 16-byte tweak: Data Unit Sequence Number "ff" in the first byte,
+// the remaining 15 bytes are zero
+static uint8_t v10_TW[16] = {
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+// Vector 10 plaintext, 512 bytes: the byte sequence 0x00..0xff written twice
+static uint8_t v10_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+// Vector 10 expected ciphertext, 512 bytes, as listed in the CTX lines of the
+// Vector 10 comment
+static uint8_t v10_CTX[512] = {
+ 0x1c, 0x3b, 0x3a, 0x10, 0x2f, 0x77, 0x03, 0x86,
+ 0xe4, 0x83, 0x6c, 0x99, 0xe3, 0x70, 0xcf, 0x9b,
+ 0xea, 0x00, 0x80, 0x3f, 0x5e, 0x48, 0x23, 0x57,
+ 0xa4, 0xae, 0x12, 0xd4, 0x14, 0xa3, 0xe6, 0x3b,
+ 0x5d, 0x31, 0xe2, 0x76, 0xf8, 0xfe, 0x4a, 0x8d,
+ 0x66, 0xb3, 0x17, 0xf9, 0xac, 0x68, 0x3f, 0x44,
+ 0x68, 0x0a, 0x86, 0xac, 0x35, 0xad, 0xfc, 0x33,
+ 0x45, 0xbe, 0xfe, 0xcb, 0x4b, 0xb1, 0x88, 0xfd,
+ 0x57, 0x76, 0x92, 0x6c, 0x49, 0xa3, 0x09, 0x5e,
+ 0xb1, 0x08, 0xfd, 0x10, 0x98, 0xba, 0xec, 0x70,
+ 0xaa, 0xa6, 0x69, 0x99, 0xa7, 0x2a, 0x82, 0xf2,
+ 0x7d, 0x84, 0x8b, 0x21, 0xd4, 0xa7, 0x41, 0xb0,
+ 0xc5, 0xcd, 0x4d, 0x5f, 0xff, 0x9d, 0xac, 0x89,
+ 0xae, 0xba, 0x12, 0x29, 0x61, 0xd0, 0x3a, 0x75,
+ 0x71, 0x23, 0xe9, 0x87, 0x0f, 0x8a, 0xcf, 0x10,
+ 0x00, 0x02, 0x08, 0x87, 0x89, 0x14, 0x29, 0xca,
+ 0x2a, 0x3e, 0x7a, 0x7d, 0x7d, 0xf7, 0xb1, 0x03,
+ 0x55, 0x16, 0x5c, 0x8b, 0x9a, 0x6d, 0x0a, 0x7d,
+ 0xe8, 0xb0, 0x62, 0xc4, 0x50, 0x0d, 0xc4, 0xcd,
+ 0x12, 0x0c, 0x0f, 0x74, 0x18, 0xda, 0xe3, 0xd0,
+ 0xb5, 0x78, 0x1c, 0x34, 0x80, 0x3f, 0xa7, 0x54,
+ 0x21, 0xc7, 0x90, 0xdf, 0xe1, 0xde, 0x18, 0x34,
+ 0xf2, 0x80, 0xd7, 0x66, 0x7b, 0x32, 0x7f, 0x6c,
+ 0x8c, 0xd7, 0x55, 0x7e, 0x12, 0xac, 0x3a, 0x0f,
+ 0x93, 0xec, 0x05, 0xc5, 0x2e, 0x04, 0x93, 0xef,
+ 0x31, 0xa1, 0x2d, 0x3d, 0x92, 0x60, 0xf7, 0x9a,
+ 0x28, 0x9d, 0x6a, 0x37, 0x9b, 0xc7, 0x0c, 0x50,
+ 0x84, 0x14, 0x73, 0xd1, 0xa8, 0xcc, 0x81, 0xec,
+ 0x58, 0x3e, 0x96, 0x45, 0xe0, 0x7b, 0x8d, 0x96,
+ 0x70, 0x65, 0x5b, 0xa5, 0xbb, 0xcf, 0xec, 0xc6,
+ 0xdc, 0x39, 0x66, 0x38, 0x0a, 0xd8, 0xfe, 0xcb,
+ 0x17, 0xb6, 0xba, 0x02, 0x46, 0x9a, 0x02, 0x0a,
+ 0x84, 0xe1, 0x8e, 0x8f, 0x84, 0x25, 0x20, 0x70,
+ 0xc1, 0x3e, 0x9f, 0x1f, 0x28, 0x9b, 0xe5, 0x4f,
+ 0xbc, 0x48, 0x14, 0x57, 0x77, 0x8f, 0x61, 0x60,
+ 0x15, 0xe1, 0x32, 0x7a, 0x02, 0xb1, 0x40, 0xf1,
+ 0x50, 0x5e, 0xb3, 0x09, 0x32, 0x6d, 0x68, 0x37,
+ 0x8f, 0x83, 0x74, 0x59, 0x5c, 0x84, 0x9d, 0x84,
+ 0xf4, 0xc3, 0x33, 0xec, 0x44, 0x23, 0x88, 0x51,
+ 0x43, 0xcb, 0x47, 0xbd, 0x71, 0xc5, 0xed, 0xae,
+ 0x9b, 0xe6, 0x9a, 0x2f, 0xfe, 0xce, 0xb1, 0xbe,
+ 0xc9, 0xde, 0x24, 0x4f, 0xbe, 0x15, 0x99, 0x2b,
+ 0x11, 0xb7, 0x7c, 0x04, 0x0f, 0x12, 0xbd, 0x8f,
+ 0x6a, 0x97, 0x5a, 0x44, 0xa0, 0xf9, 0x0c, 0x29,
+ 0xa9, 0xab, 0xc3, 0xd4, 0xd8, 0x93, 0x92, 0x72,
+ 0x84, 0xc5, 0x87, 0x54, 0xcc, 0xe2, 0x94, 0x52,
+ 0x9f, 0x86, 0x14, 0xdc, 0xd2, 0xab, 0xa9, 0x91,
+ 0x92, 0x5f, 0xed, 0xc4, 0xae, 0x74, 0xff, 0xac,
+ 0x6e, 0x33, 0x3b, 0x93, 0xeb, 0x4a, 0xff, 0x04,
+ 0x79, 0xda, 0x9a, 0x41, 0x0e, 0x44, 0x50, 0xe0,
+ 0xdd, 0x7a, 0xe4, 0xc6, 0xe2, 0x91, 0x09, 0x00,
+ 0x57, 0x5d, 0xa4, 0x01, 0xfc, 0x07, 0x05, 0x9f,
+ 0x64, 0x5e, 0x8b, 0x7e, 0x9b, 0xfd, 0xef, 0x33,
+ 0x94, 0x30, 0x54, 0xff, 0x84, 0x01, 0x14, 0x93,
+ 0xc2, 0x7b, 0x34, 0x29, 0xea, 0xed, 0xb4, 0xed,
+ 0x53, 0x76, 0x44, 0x1a, 0x77, 0xed, 0x43, 0x85,
+ 0x1a, 0xd7, 0x7f, 0x16, 0xf5, 0x41, 0xdf, 0xd2,
+ 0x69, 0xd5, 0x0d, 0x6a, 0x5f, 0x14, 0xfb, 0x0a,
+ 0xab, 0x1c, 0xbb, 0x4c, 0x15, 0x50, 0xbe, 0x97,
+ 0xf7, 0xab, 0x40, 0x66, 0x19, 0x3c, 0x4c, 0xaa,
+ 0x77, 0x3d, 0xad, 0x38, 0x01, 0x4b, 0xd2, 0x09,
+ 0x2f, 0xa7, 0x55, 0xc8, 0x24, 0xbb, 0x5e, 0x54,
+ 0xc4, 0xf3, 0x6f, 0xfd, 0xa9, 0xfc, 0xea, 0x70,
+ 0xb9, 0xc6, 0xe6, 0x93, 0xe1, 0x48, 0xc1, 0x51
+};
+
+/*
+ * Vector 11
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 77a31251618a15e6b92d1d66dffe7b50b50bad552305ba0217a610688eff7e11
+ * CTX e1d0225438e093242d6db274fde801d4cae06f2092c728b2478559df58e837c2
+ * CTX 469ee4a4fa794e4bbc7f39bc026e3cb72c33b0888f25b4acf56a2a9804f1ce6d
+ * CTX 3d6e1dc6ca181d4b546179d55544aa7760c40d06741539c7e3cd9d2f6650b201
+ * CTX 3fd0eeb8c2b8e3d8d240ccae2d4c98320a7442e1c8d75a42d6e6cfa4c2eca179
+ * CTX 8d158c7aecdf82490f24bb9b38e108bcda12c3faf9a21141c3613b58367f922a
+ * CTX aa26cd22f23d708dae699ad7cb40a8ad0b6e2784973dcb605684c08b8d6998c6
+ * CTX 9aac049921871ebb65301a4619ca80ecb485a31d744223ce8ddc2394828d6a80
+ * CTX 470c092f5ba413c3378fa6054255c6f9df4495862bbb3287681f931b687c888a
+ * CTX bf844dfc8fc28331e579928cd12bd2390ae123cf03818d14dedde5c0c24c8ab0
+ * CTX 18bfca75ca096f2d531f3d1619e785f1ada437cab92e980558b3dce1474afb75
+ * CTX bfedbf8ff54cb2618e0244c9ac0d3c66fb51598cd2db11f9be39791abe447c63
+ * CTX 094f7c453b7ff87cb5bb36b7c79efb0872d17058b83b15ab0866ad8a58656c5a
+ * CTX 7e20dbdf308b2461d97c0ec0024a2715055249cf3b478ddd4740de654f75ca68
+ * CTX 6e0d7345c69ed50cdc2a8b332b1f8824108ac937eb050585608ee734097fc090
+ * CTX 54fbff89eeaeea791f4a7ab1f9868294a4f9e27b42af8100cb9d59cef9645803
+ * Plaintext length (bytes): 512
+ *
+*/
+// Vector 11, Key1: the 32 bytes of the "Key1" hex string in the Vector 11 comment
+static uint8_t v11_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+// Vector 11, Key2: the 32 bytes of the "Key2" hex string in the Vector 11 comment
+static uint8_t v11_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+// Vector 11, 16-byte tweak: Data Unit Sequence Number "ffff" in the first two
+// bytes, the remaining 14 bytes are zero
+static uint8_t v11_TW[16] = {
+ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+// Vector 11 plaintext, 512 bytes: the byte sequence 0x00..0xff written twice
+static uint8_t v11_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+// Vector 11 expected ciphertext, 512 bytes, as listed in the CTX lines of the
+// Vector 11 comment
+static uint8_t v11_CTX[512] = {
+ 0x77, 0xa3, 0x12, 0x51, 0x61, 0x8a, 0x15, 0xe6,
+ 0xb9, 0x2d, 0x1d, 0x66, 0xdf, 0xfe, 0x7b, 0x50,
+ 0xb5, 0x0b, 0xad, 0x55, 0x23, 0x05, 0xba, 0x02,
+ 0x17, 0xa6, 0x10, 0x68, 0x8e, 0xff, 0x7e, 0x11,
+ 0xe1, 0xd0, 0x22, 0x54, 0x38, 0xe0, 0x93, 0x24,
+ 0x2d, 0x6d, 0xb2, 0x74, 0xfd, 0xe8, 0x01, 0xd4,
+ 0xca, 0xe0, 0x6f, 0x20, 0x92, 0xc7, 0x28, 0xb2,
+ 0x47, 0x85, 0x59, 0xdf, 0x58, 0xe8, 0x37, 0xc2,
+ 0x46, 0x9e, 0xe4, 0xa4, 0xfa, 0x79, 0x4e, 0x4b,
+ 0xbc, 0x7f, 0x39, 0xbc, 0x02, 0x6e, 0x3c, 0xb7,
+ 0x2c, 0x33, 0xb0, 0x88, 0x8f, 0x25, 0xb4, 0xac,
+ 0xf5, 0x6a, 0x2a, 0x98, 0x04, 0xf1, 0xce, 0x6d,
+ 0x3d, 0x6e, 0x1d, 0xc6, 0xca, 0x18, 0x1d, 0x4b,
+ 0x54, 0x61, 0x79, 0xd5, 0x55, 0x44, 0xaa, 0x77,
+ 0x60, 0xc4, 0x0d, 0x06, 0x74, 0x15, 0x39, 0xc7,
+ 0xe3, 0xcd, 0x9d, 0x2f, 0x66, 0x50, 0xb2, 0x01,
+ 0x3f, 0xd0, 0xee, 0xb8, 0xc2, 0xb8, 0xe3, 0xd8,
+ 0xd2, 0x40, 0xcc, 0xae, 0x2d, 0x4c, 0x98, 0x32,
+ 0x0a, 0x74, 0x42, 0xe1, 0xc8, 0xd7, 0x5a, 0x42,
+ 0xd6, 0xe6, 0xcf, 0xa4, 0xc2, 0xec, 0xa1, 0x79,
+ 0x8d, 0x15, 0x8c, 0x7a, 0xec, 0xdf, 0x82, 0x49,
+ 0x0f, 0x24, 0xbb, 0x9b, 0x38, 0xe1, 0x08, 0xbc,
+ 0xda, 0x12, 0xc3, 0xfa, 0xf9, 0xa2, 0x11, 0x41,
+ 0xc3, 0x61, 0x3b, 0x58, 0x36, 0x7f, 0x92, 0x2a,
+ 0xaa, 0x26, 0xcd, 0x22, 0xf2, 0x3d, 0x70, 0x8d,
+ 0xae, 0x69, 0x9a, 0xd7, 0xcb, 0x40, 0xa8, 0xad,
+ 0x0b, 0x6e, 0x27, 0x84, 0x97, 0x3d, 0xcb, 0x60,
+ 0x56, 0x84, 0xc0, 0x8b, 0x8d, 0x69, 0x98, 0xc6,
+ 0x9a, 0xac, 0x04, 0x99, 0x21, 0x87, 0x1e, 0xbb,
+ 0x65, 0x30, 0x1a, 0x46, 0x19, 0xca, 0x80, 0xec,
+ 0xb4, 0x85, 0xa3, 0x1d, 0x74, 0x42, 0x23, 0xce,
+ 0x8d, 0xdc, 0x23, 0x94, 0x82, 0x8d, 0x6a, 0x80,
+ 0x47, 0x0c, 0x09, 0x2f, 0x5b, 0xa4, 0x13, 0xc3,
+ 0x37, 0x8f, 0xa6, 0x05, 0x42, 0x55, 0xc6, 0xf9,
+ 0xdf, 0x44, 0x95, 0x86, 0x2b, 0xbb, 0x32, 0x87,
+ 0x68, 0x1f, 0x93, 0x1b, 0x68, 0x7c, 0x88, 0x8a,
+ 0xbf, 0x84, 0x4d, 0xfc, 0x8f, 0xc2, 0x83, 0x31,
+ 0xe5, 0x79, 0x92, 0x8c, 0xd1, 0x2b, 0xd2, 0x39,
+ 0x0a, 0xe1, 0x23, 0xcf, 0x03, 0x81, 0x8d, 0x14,
+ 0xde, 0xdd, 0xe5, 0xc0, 0xc2, 0x4c, 0x8a, 0xb0,
+ 0x18, 0xbf, 0xca, 0x75, 0xca, 0x09, 0x6f, 0x2d,
+ 0x53, 0x1f, 0x3d, 0x16, 0x19, 0xe7, 0x85, 0xf1,
+ 0xad, 0xa4, 0x37, 0xca, 0xb9, 0x2e, 0x98, 0x05,
+ 0x58, 0xb3, 0xdc, 0xe1, 0x47, 0x4a, 0xfb, 0x75,
+ 0xbf, 0xed, 0xbf, 0x8f, 0xf5, 0x4c, 0xb2, 0x61,
+ 0x8e, 0x02, 0x44, 0xc9, 0xac, 0x0d, 0x3c, 0x66,
+ 0xfb, 0x51, 0x59, 0x8c, 0xd2, 0xdb, 0x11, 0xf9,
+ 0xbe, 0x39, 0x79, 0x1a, 0xbe, 0x44, 0x7c, 0x63,
+ 0x09, 0x4f, 0x7c, 0x45, 0x3b, 0x7f, 0xf8, 0x7c,
+ 0xb5, 0xbb, 0x36, 0xb7, 0xc7, 0x9e, 0xfb, 0x08,
+ 0x72, 0xd1, 0x70, 0x58, 0xb8, 0x3b, 0x15, 0xab,
+ 0x08, 0x66, 0xad, 0x8a, 0x58, 0x65, 0x6c, 0x5a,
+ 0x7e, 0x20, 0xdb, 0xdf, 0x30, 0x8b, 0x24, 0x61,
+ 0xd9, 0x7c, 0x0e, 0xc0, 0x02, 0x4a, 0x27, 0x15,
+ 0x05, 0x52, 0x49, 0xcf, 0x3b, 0x47, 0x8d, 0xdd,
+ 0x47, 0x40, 0xde, 0x65, 0x4f, 0x75, 0xca, 0x68,
+ 0x6e, 0x0d, 0x73, 0x45, 0xc6, 0x9e, 0xd5, 0x0c,
+ 0xdc, 0x2a, 0x8b, 0x33, 0x2b, 0x1f, 0x88, 0x24,
+ 0x10, 0x8a, 0xc9, 0x37, 0xeb, 0x05, 0x05, 0x85,
+ 0x60, 0x8e, 0xe7, 0x34, 0x09, 0x7f, 0xc0, 0x90,
+ 0x54, 0xfb, 0xff, 0x89, 0xee, 0xae, 0xea, 0x79,
+ 0x1f, 0x4a, 0x7a, 0xb1, 0xf9, 0x86, 0x82, 0x94,
+ 0xa4, 0xf9, 0xe2, 0x7b, 0x42, 0xaf, 0x81, 0x00,
+ 0xcb, 0x9d, 0x59, 0xce, 0xf9, 0x64, 0x58, 0x03
+};
+
+/*
+ * Vector 12
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX e387aaa58ba483afa7e8eb469778317ecf4cf573aa9d4eac23f2cdf914e4e200
+ * CTX a8b490e42ee646802dc6ee2b471b278195d60918ececb44bf79966f83faba049
+ * CTX 9298ebc699c0c8634715a320bb4f075d622e74c8c932004f25b41e361025b5a8
+ * CTX 7815391f6108fc4afa6a05d9303c6ba68a128a55705d415985832fdeaae6c8e1
+ * CTX 9110e84d1b1f199a2692119edc96132658f09da7c623efcec712537a3d94c0bf
+ * CTX 5d7e352ec94ae5797fdb377dc1551150721adf15bd26a8efc2fcaad56881fa9e
+ * CTX 62462c28f30ae1ceaca93c345cf243b73f542e2074a705bd2643bb9f7cc79bb6
+ * CTX e7091ea6e232df0f9ad0d6cf502327876d82207abf2115cdacf6d5a48f6c1879
+ * CTX a65b115f0f8b3cb3c59d15dd8c769bc014795a1837f3901b5845eb491adfefe0
+ * CTX 97b1fa30a12fc1f65ba22905031539971a10f2f36c321bb51331cdefb39e3964
+ * CTX c7ef079994f5b69b2edd83a71ef549971ee93f44eac3938fcdd61d01fa71799d
+ * CTX a3a8091c4c48aa9ed263ff0749df95d44fef6a0bb578ec69456aa5408ae32c7a
+ * CTX f08ad7ba8921287e3bbee31b767be06a0e705c864a769137df28292283ea81a2
+ * CTX 480241b44d9921cdbec1bc28dc1fda114bd8e5217ac9d8ebafa720e9da4f9ace
+ * CTX 231cc949e5b96fe76ffc21063fddc83a6b8679c00d35e09576a875305bed5f36
+ * CTX ed242c8900dd1fa965bc950dfce09b132263a1eef52dd6888c309f5a7d712826
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v12_key1[32] = { // Vector 12 Key1: the 32 bytes printed on the "Key1" line of the comment above
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v12_key2[32] = { // Vector 12 Key2: the 32 bytes printed on the "Key2" line of the comment above
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v12_TW[16] = { // Data Unit Sequence Number ffffff: three 0xff bytes first, remaining 13 bytes zero
+ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v12_PTX[512] = { // Vector 12 PTX: the ascending byte run 0x00..0xff, written twice (512 bytes)
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v12_CTX[512] = { // Vector 12 CTX, 512 bytes; compare against the "CTX" lines of the comment above
+ 0xe3, 0x87, 0xaa, 0xa5, 0x8b, 0xa4, 0x83, 0xaf,
+ 0xa7, 0xe8, 0xeb, 0x46, 0x97, 0x78, 0x31, 0x7e,
+ 0xcf, 0x4c, 0xf5, 0x73, 0xaa, 0x9d, 0x4e, 0xac,
+ 0x23, 0xf2, 0xcd, 0xf9, 0x14, 0xe4, 0xe2, 0x00,
+ 0xa8, 0xb4, 0x90, 0xe4, 0x2e, 0xe6, 0x46, 0x80,
+ 0x2d, 0xc6, 0xee, 0x2b, 0x47, 0x1b, 0x27, 0x81,
+ 0x95, 0xd6, 0x09, 0x18, 0xec, 0xec, 0xb4, 0x4b,
+ 0xf7, 0x99, 0x66, 0xf8, 0x3f, 0xab, 0xa0, 0x49,
+ 0x92, 0x98, 0xeb, 0xc6, 0x99, 0xc0, 0xc8, 0x63,
+ 0x47, 0x15, 0xa3, 0x20, 0xbb, 0x4f, 0x07, 0x5d,
+ 0x62, 0x2e, 0x74, 0xc8, 0xc9, 0x32, 0x00, 0x4f,
+ 0x25, 0xb4, 0x1e, 0x36, 0x10, 0x25, 0xb5, 0xa8,
+ 0x78, 0x15, 0x39, 0x1f, 0x61, 0x08, 0xfc, 0x4a,
+ 0xfa, 0x6a, 0x05, 0xd9, 0x30, 0x3c, 0x6b, 0xa6,
+ 0x8a, 0x12, 0x8a, 0x55, 0x70, 0x5d, 0x41, 0x59,
+ 0x85, 0x83, 0x2f, 0xde, 0xaa, 0xe6, 0xc8, 0xe1,
+ 0x91, 0x10, 0xe8, 0x4d, 0x1b, 0x1f, 0x19, 0x9a,
+ 0x26, 0x92, 0x11, 0x9e, 0xdc, 0x96, 0x13, 0x26,
+ 0x58, 0xf0, 0x9d, 0xa7, 0xc6, 0x23, 0xef, 0xce,
+ 0xc7, 0x12, 0x53, 0x7a, 0x3d, 0x94, 0xc0, 0xbf,
+ 0x5d, 0x7e, 0x35, 0x2e, 0xc9, 0x4a, 0xe5, 0x79,
+ 0x7f, 0xdb, 0x37, 0x7d, 0xc1, 0x55, 0x11, 0x50,
+ 0x72, 0x1a, 0xdf, 0x15, 0xbd, 0x26, 0xa8, 0xef,
+ 0xc2, 0xfc, 0xaa, 0xd5, 0x68, 0x81, 0xfa, 0x9e,
+ 0x62, 0x46, 0x2c, 0x28, 0xf3, 0x0a, 0xe1, 0xce,
+ 0xac, 0xa9, 0x3c, 0x34, 0x5c, 0xf2, 0x43, 0xb7,
+ 0x3f, 0x54, 0x2e, 0x20, 0x74, 0xa7, 0x05, 0xbd,
+ 0x26, 0x43, 0xbb, 0x9f, 0x7c, 0xc7, 0x9b, 0xb6,
+ 0xe7, 0x09, 0x1e, 0xa6, 0xe2, 0x32, 0xdf, 0x0f,
+ 0x9a, 0xd0, 0xd6, 0xcf, 0x50, 0x23, 0x27, 0x87,
+ 0x6d, 0x82, 0x20, 0x7a, 0xbf, 0x21, 0x15, 0xcd,
+ 0xac, 0xf6, 0xd5, 0xa4, 0x8f, 0x6c, 0x18, 0x79,
+ 0xa6, 0x5b, 0x11, 0x5f, 0x0f, 0x8b, 0x3c, 0xb3,
+ 0xc5, 0x9d, 0x15, 0xdd, 0x8c, 0x76, 0x9b, 0xc0,
+ 0x14, 0x79, 0x5a, 0x18, 0x37, 0xf3, 0x90, 0x1b,
+ 0x58, 0x45, 0xeb, 0x49, 0x1a, 0xdf, 0xef, 0xe0,
+ 0x97, 0xb1, 0xfa, 0x30, 0xa1, 0x2f, 0xc1, 0xf6,
+ 0x5b, 0xa2, 0x29, 0x05, 0x03, 0x15, 0x39, 0x97,
+ 0x1a, 0x10, 0xf2, 0xf3, 0x6c, 0x32, 0x1b, 0xb5,
+ 0x13, 0x31, 0xcd, 0xef, 0xb3, 0x9e, 0x39, 0x64,
+ 0xc7, 0xef, 0x07, 0x99, 0x94, 0xf5, 0xb6, 0x9b,
+ 0x2e, 0xdd, 0x83, 0xa7, 0x1e, 0xf5, 0x49, 0x97,
+ 0x1e, 0xe9, 0x3f, 0x44, 0xea, 0xc3, 0x93, 0x8f,
+ 0xcd, 0xd6, 0x1d, 0x01, 0xfa, 0x71, 0x79, 0x9d,
+ 0xa3, 0xa8, 0x09, 0x1c, 0x4c, 0x48, 0xaa, 0x9e,
+ 0xd2, 0x63, 0xff, 0x07, 0x49, 0xdf, 0x95, 0xd4,
+ 0x4f, 0xef, 0x6a, 0x0b, 0xb5, 0x78, 0xec, 0x69,
+ 0x45, 0x6a, 0xa5, 0x40, 0x8a, 0xe3, 0x2c, 0x7a,
+ 0xf0, 0x8a, 0xd7, 0xba, 0x89, 0x21, 0x28, 0x7e,
+ 0x3b, 0xbe, 0xe3, 0x1b, 0x76, 0x7b, 0xe0, 0x6a,
+ 0x0e, 0x70, 0x5c, 0x86, 0x4a, 0x76, 0x91, 0x37,
+ 0xdf, 0x28, 0x29, 0x22, 0x83, 0xea, 0x81, 0xa2,
+ 0x48, 0x02, 0x41, 0xb4, 0x4d, 0x99, 0x21, 0xcd,
+ 0xbe, 0xc1, 0xbc, 0x28, 0xdc, 0x1f, 0xda, 0x11,
+ 0x4b, 0xd8, 0xe5, 0x21, 0x7a, 0xc9, 0xd8, 0xeb,
+ 0xaf, 0xa7, 0x20, 0xe9, 0xda, 0x4f, 0x9a, 0xce,
+ 0x23, 0x1c, 0xc9, 0x49, 0xe5, 0xb9, 0x6f, 0xe7,
+ 0x6f, 0xfc, 0x21, 0x06, 0x3f, 0xdd, 0xc8, 0x3a,
+ 0x6b, 0x86, 0x79, 0xc0, 0x0d, 0x35, 0xe0, 0x95,
+ 0x76, 0xa8, 0x75, 0x30, 0x5b, 0xed, 0x5f, 0x36,
+ 0xed, 0x24, 0x2c, 0x89, 0x00, 0xdd, 0x1f, 0xa9,
+ 0x65, 0xbc, 0x95, 0x0d, 0xfc, 0xe0, 0x9b, 0x13,
+ 0x22, 0x63, 0xa1, 0xee, 0xf5, 0x2d, 0xd6, 0x88,
+ 0x8c, 0x30, 0x9f, 0x5a, 0x7d, 0x71, 0x28, 0x26
+};
+
+/*
+ * Vector 13
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX bf53d2dade78e822a4d949a9bc6766b01b06a8ef70d26748c6a7fc36d80ae4c5
+ * CTX 520f7c4ab0ac8544424fa405162fef5a6b7f229498063618d39f0003cb5fb8d1
+ * CTX c86b643497da1ff945c8d3bedeca4f479702a7a735f043ddb1d6aaade3c4a0ac
+ * CTX 7ca7f3fa5279bef56f82cd7a2f38672e824814e10700300a055e1630b8f1cb0e
+ * CTX 919f5e942010a416e2bf48cb46993d3cb6a51c19bacf864785a00bc2ecff15d3
+ * CTX 50875b246ed53e68be6f55bd7e05cfc2b2ed6432198a6444b6d8c247fab941f5
+ * CTX 69768b5c429366f1d3f00f0345b96123d56204c01c63b22ce78baf116e525ed9
+ * CTX 0fdea39fa469494d3866c31e05f295ff21fea8d4e6e13d67e47ce722e9698a1c
+ * CTX 1048d68ebcde76b86fcf976eab8aa9790268b7068e017a8b9b749409514f1053
+ * CTX 027fd16c3786ea1bac5f15cb79711ee2abe82f5cf8b13ae73030ef5b9e4457e7
+ * CTX 5d1304f988d62dd6fc4b94ed38ba831da4b7634971b6cd8ec325d9c61c00f1df
+ * CTX 73627ed3745a5e8489f3a95c69639c32cd6e1d537a85f75cc844726e8a72fc00
+ * CTX 77ad22000f1d5078f6b866318c668f1ad03d5a5fced5219f2eabbd0aa5c0f460
+ * CTX d183f04404a0d6f469558e81fab24a167905ab4c7878502ad3e38fdbe62a4155
+ * CTX 6cec37325759533ce8f25f367c87bb5578d667ae93f9e2fd99bcbc5f2fbba88c
+ * CTX f6516139420fcff3b7361d86322c4bd84c82f335abb152c4a93411373aaa8220
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v13_key1[32] = { // Vector 13 Key1: the 32 bytes printed on the "Key1" line of the comment above
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v13_key2[32] = { // Vector 13 Key2: the 32 bytes printed on the "Key2" line of the comment above
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v13_TW[16] = { // Data Unit Sequence Number ffffffff: four 0xff bytes first, remaining 12 bytes zero
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v13_PTX[512] = { // Vector 13 PTX: the ascending byte run 0x00..0xff, written twice (512 bytes)
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v13_CTX[512] = { // Vector 13 CTX, 512 bytes; compare against the "CTX" lines of the comment above
+ 0xbf, 0x53, 0xd2, 0xda, 0xde, 0x78, 0xe8, 0x22,
+ 0xa4, 0xd9, 0x49, 0xa9, 0xbc, 0x67, 0x66, 0xb0,
+ 0x1b, 0x06, 0xa8, 0xef, 0x70, 0xd2, 0x67, 0x48,
+ 0xc6, 0xa7, 0xfc, 0x36, 0xd8, 0x0a, 0xe4, 0xc5,
+ 0x52, 0x0f, 0x7c, 0x4a, 0xb0, 0xac, 0x85, 0x44,
+ 0x42, 0x4f, 0xa4, 0x05, 0x16, 0x2f, 0xef, 0x5a,
+ 0x6b, 0x7f, 0x22, 0x94, 0x98, 0x06, 0x36, 0x18,
+ 0xd3, 0x9f, 0x00, 0x03, 0xcb, 0x5f, 0xb8, 0xd1,
+ 0xc8, 0x6b, 0x64, 0x34, 0x97, 0xda, 0x1f, 0xf9,
+ 0x45, 0xc8, 0xd3, 0xbe, 0xde, 0xca, 0x4f, 0x47,
+ 0x97, 0x02, 0xa7, 0xa7, 0x35, 0xf0, 0x43, 0xdd,
+ 0xb1, 0xd6, 0xaa, 0xad, 0xe3, 0xc4, 0xa0, 0xac,
+ 0x7c, 0xa7, 0xf3, 0xfa, 0x52, 0x79, 0xbe, 0xf5,
+ 0x6f, 0x82, 0xcd, 0x7a, 0x2f, 0x38, 0x67, 0x2e,
+ 0x82, 0x48, 0x14, 0xe1, 0x07, 0x00, 0x30, 0x0a,
+ 0x05, 0x5e, 0x16, 0x30, 0xb8, 0xf1, 0xcb, 0x0e,
+ 0x91, 0x9f, 0x5e, 0x94, 0x20, 0x10, 0xa4, 0x16,
+ 0xe2, 0xbf, 0x48, 0xcb, 0x46, 0x99, 0x3d, 0x3c,
+ 0xb6, 0xa5, 0x1c, 0x19, 0xba, 0xcf, 0x86, 0x47,
+ 0x85, 0xa0, 0x0b, 0xc2, 0xec, 0xff, 0x15, 0xd3,
+ 0x50, 0x87, 0x5b, 0x24, 0x6e, 0xd5, 0x3e, 0x68,
+ 0xbe, 0x6f, 0x55, 0xbd, 0x7e, 0x05, 0xcf, 0xc2,
+ 0xb2, 0xed, 0x64, 0x32, 0x19, 0x8a, 0x64, 0x44,
+ 0xb6, 0xd8, 0xc2, 0x47, 0xfa, 0xb9, 0x41, 0xf5,
+ 0x69, 0x76, 0x8b, 0x5c, 0x42, 0x93, 0x66, 0xf1,
+ 0xd3, 0xf0, 0x0f, 0x03, 0x45, 0xb9, 0x61, 0x23,
+ 0xd5, 0x62, 0x04, 0xc0, 0x1c, 0x63, 0xb2, 0x2c,
+ 0xe7, 0x8b, 0xaf, 0x11, 0x6e, 0x52, 0x5e, 0xd9,
+ 0x0f, 0xde, 0xa3, 0x9f, 0xa4, 0x69, 0x49, 0x4d,
+ 0x38, 0x66, 0xc3, 0x1e, 0x05, 0xf2, 0x95, 0xff,
+ 0x21, 0xfe, 0xa8, 0xd4, 0xe6, 0xe1, 0x3d, 0x67,
+ 0xe4, 0x7c, 0xe7, 0x22, 0xe9, 0x69, 0x8a, 0x1c,
+ 0x10, 0x48, 0xd6, 0x8e, 0xbc, 0xde, 0x76, 0xb8,
+ 0x6f, 0xcf, 0x97, 0x6e, 0xab, 0x8a, 0xa9, 0x79,
+ 0x02, 0x68, 0xb7, 0x06, 0x8e, 0x01, 0x7a, 0x8b,
+ 0x9b, 0x74, 0x94, 0x09, 0x51, 0x4f, 0x10, 0x53,
+ 0x02, 0x7f, 0xd1, 0x6c, 0x37, 0x86, 0xea, 0x1b,
+ 0xac, 0x5f, 0x15, 0xcb, 0x79, 0x71, 0x1e, 0xe2,
+ 0xab, 0xe8, 0x2f, 0x5c, 0xf8, 0xb1, 0x3a, 0xe7,
+ 0x30, 0x30, 0xef, 0x5b, 0x9e, 0x44, 0x57, 0xe7,
+ 0x5d, 0x13, 0x04, 0xf9, 0x88, 0xd6, 0x2d, 0xd6,
+ 0xfc, 0x4b, 0x94, 0xed, 0x38, 0xba, 0x83, 0x1d,
+ 0xa4, 0xb7, 0x63, 0x49, 0x71, 0xb6, 0xcd, 0x8e,
+ 0xc3, 0x25, 0xd9, 0xc6, 0x1c, 0x00, 0xf1, 0xdf,
+ 0x73, 0x62, 0x7e, 0xd3, 0x74, 0x5a, 0x5e, 0x84,
+ 0x89, 0xf3, 0xa9, 0x5c, 0x69, 0x63, 0x9c, 0x32,
+ 0xcd, 0x6e, 0x1d, 0x53, 0x7a, 0x85, 0xf7, 0x5c,
+ 0xc8, 0x44, 0x72, 0x6e, 0x8a, 0x72, 0xfc, 0x00,
+ 0x77, 0xad, 0x22, 0x00, 0x0f, 0x1d, 0x50, 0x78,
+ 0xf6, 0xb8, 0x66, 0x31, 0x8c, 0x66, 0x8f, 0x1a,
+ 0xd0, 0x3d, 0x5a, 0x5f, 0xce, 0xd5, 0x21, 0x9f,
+ 0x2e, 0xab, 0xbd, 0x0a, 0xa5, 0xc0, 0xf4, 0x60,
+ 0xd1, 0x83, 0xf0, 0x44, 0x04, 0xa0, 0xd6, 0xf4,
+ 0x69, 0x55, 0x8e, 0x81, 0xfa, 0xb2, 0x4a, 0x16,
+ 0x79, 0x05, 0xab, 0x4c, 0x78, 0x78, 0x50, 0x2a,
+ 0xd3, 0xe3, 0x8f, 0xdb, 0xe6, 0x2a, 0x41, 0x55,
+ 0x6c, 0xec, 0x37, 0x32, 0x57, 0x59, 0x53, 0x3c,
+ 0xe8, 0xf2, 0x5f, 0x36, 0x7c, 0x87, 0xbb, 0x55,
+ 0x78, 0xd6, 0x67, 0xae, 0x93, 0xf9, 0xe2, 0xfd,
+ 0x99, 0xbc, 0xbc, 0x5f, 0x2f, 0xbb, 0xa8, 0x8c,
+ 0xf6, 0x51, 0x61, 0x39, 0x42, 0x0f, 0xcf, 0xf3,
+ 0xb7, 0x36, 0x1d, 0x86, 0x32, 0x2c, 0x4b, 0xd8,
+ 0x4c, 0x82, 0xf3, 0x35, 0xab, 0xb1, 0x52, 0xc4,
+ 0xa9, 0x34, 0x11, 0x37, 0x3a, 0xaa, 0x82, 0x20
+};
+
+/*
+ * Vector 14
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 64497e5a831e4a932c09be3e5393376daa599548b816031d224bbf50a818ed23
+ * CTX 50eae7e96087c8a0db51ad290bd00c1ac1620857635bf246c176ab463be30b80
+ * CTX 8da548081ac847b158e1264be25bb0910bbc92647108089415d45fab1b3d2604
+ * CTX e8a8eff1ae4020cfa39936b66827b23f371b92200be90251e6d73c5f86de5fd4
+ * CTX a950781933d79a28272b782a2ec313efdfcc0628f43d744c2dc2ff3dcb66999b
+ * CTX 50c7ca895b0c64791eeaa5f29499fb1c026f84ce5b5c72ba1083cddb5ce45434
+ * CTX 631665c333b60b11593fb253c5179a2c8db813782a004856a1653011e93fb6d8
+ * CTX 76c18366dd8683f53412c0c180f9c848592d593f8609ca736317d356e13e2bff
+ * CTX 3a9f59cd9aeb19cd482593d8c46128bb32423b37a9adfb482b99453fbe25a41b
+ * CTX f6feb4aa0bef5ed24bf73c762978025482c13115e4015aac992e5613a3b5c2f6
+ * CTX 85b84795cb6e9b2656d8c88157e52c42f978d8634c43d06fea928f2822e465aa
+ * CTX 6576e9bf419384506cc3ce3c54ac1a6f67dc66f3b30191e698380bc999b05abc
+ * CTX e19dc0c6dcc2dd001ec535ba18deb2df1a101023108318c75dc98611a09dc48a
+ * CTX 0acdec676fabdf222f07e026f059b672b56e5cbc8e1d21bbd867dd9272120546
+ * CTX 81d70ea737134cdfce93b6f82ae22423274e58a0821cc5502e2d0ab4585e94de
+ * CTX 6975be5e0b4efce51cd3e70c25a1fbbbd609d273ad5b0d59631c531f6a0a57b9
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v14_key1[32] = { // Vector 14 Key1: the 32 bytes printed on the "Key1" line of the comment above
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v14_key2[32] = { // Vector 14 Key2: the 32 bytes printed on the "Key2" line of the comment above
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v14_TW[16] = { // Data Unit Sequence Number ffffffffff: five 0xff bytes first, remaining 11 bytes zero
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v14_PTX[512] = { // Vector 14 PTX: the ascending byte run 0x00..0xff, written twice (512 bytes)
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v14_CTX[512] = { // Vector 14 CTX, 512 bytes; compare against the "CTX" lines of the comment above
+ 0x64, 0x49, 0x7e, 0x5a, 0x83, 0x1e, 0x4a, 0x93,
+ 0x2c, 0x09, 0xbe, 0x3e, 0x53, 0x93, 0x37, 0x6d,
+ 0xaa, 0x59, 0x95, 0x48, 0xb8, 0x16, 0x03, 0x1d,
+ 0x22, 0x4b, 0xbf, 0x50, 0xa8, 0x18, 0xed, 0x23,
+ 0x50, 0xea, 0xe7, 0xe9, 0x60, 0x87, 0xc8, 0xa0,
+ 0xdb, 0x51, 0xad, 0x29, 0x0b, 0xd0, 0x0c, 0x1a,
+ 0xc1, 0x62, 0x08, 0x57, 0x63, 0x5b, 0xf2, 0x46,
+ 0xc1, 0x76, 0xab, 0x46, 0x3b, 0xe3, 0x0b, 0x80,
+ 0x8d, 0xa5, 0x48, 0x08, 0x1a, 0xc8, 0x47, 0xb1,
+ 0x58, 0xe1, 0x26, 0x4b, 0xe2, 0x5b, 0xb0, 0x91,
+ 0x0b, 0xbc, 0x92, 0x64, 0x71, 0x08, 0x08, 0x94,
+ 0x15, 0xd4, 0x5f, 0xab, 0x1b, 0x3d, 0x26, 0x04,
+ 0xe8, 0xa8, 0xef, 0xf1, 0xae, 0x40, 0x20, 0xcf,
+ 0xa3, 0x99, 0x36, 0xb6, 0x68, 0x27, 0xb2, 0x3f,
+ 0x37, 0x1b, 0x92, 0x20, 0x0b, 0xe9, 0x02, 0x51,
+ 0xe6, 0xd7, 0x3c, 0x5f, 0x86, 0xde, 0x5f, 0xd4,
+ 0xa9, 0x50, 0x78, 0x19, 0x33, 0xd7, 0x9a, 0x28,
+ 0x27, 0x2b, 0x78, 0x2a, 0x2e, 0xc3, 0x13, 0xef,
+ 0xdf, 0xcc, 0x06, 0x28, 0xf4, 0x3d, 0x74, 0x4c,
+ 0x2d, 0xc2, 0xff, 0x3d, 0xcb, 0x66, 0x99, 0x9b,
+ 0x50, 0xc7, 0xca, 0x89, 0x5b, 0x0c, 0x64, 0x79,
+ 0x1e, 0xea, 0xa5, 0xf2, 0x94, 0x99, 0xfb, 0x1c,
+ 0x02, 0x6f, 0x84, 0xce, 0x5b, 0x5c, 0x72, 0xba,
+ 0x10, 0x83, 0xcd, 0xdb, 0x5c, 0xe4, 0x54, 0x34,
+ 0x63, 0x16, 0x65, 0xc3, 0x33, 0xb6, 0x0b, 0x11,
+ 0x59, 0x3f, 0xb2, 0x53, 0xc5, 0x17, 0x9a, 0x2c,
+ 0x8d, 0xb8, 0x13, 0x78, 0x2a, 0x00, 0x48, 0x56,
+ 0xa1, 0x65, 0x30, 0x11, 0xe9, 0x3f, 0xb6, 0xd8,
+ 0x76, 0xc1, 0x83, 0x66, 0xdd, 0x86, 0x83, 0xf5,
+ 0x34, 0x12, 0xc0, 0xc1, 0x80, 0xf9, 0xc8, 0x48,
+ 0x59, 0x2d, 0x59, 0x3f, 0x86, 0x09, 0xca, 0x73,
+ 0x63, 0x17, 0xd3, 0x56, 0xe1, 0x3e, 0x2b, 0xff,
+ 0x3a, 0x9f, 0x59, 0xcd, 0x9a, 0xeb, 0x19, 0xcd,
+ 0x48, 0x25, 0x93, 0xd8, 0xc4, 0x61, 0x28, 0xbb,
+ 0x32, 0x42, 0x3b, 0x37, 0xa9, 0xad, 0xfb, 0x48,
+ 0x2b, 0x99, 0x45, 0x3f, 0xbe, 0x25, 0xa4, 0x1b,
+ 0xf6, 0xfe, 0xb4, 0xaa, 0x0b, 0xef, 0x5e, 0xd2,
+ 0x4b, 0xf7, 0x3c, 0x76, 0x29, 0x78, 0x02, 0x54,
+ 0x82, 0xc1, 0x31, 0x15, 0xe4, 0x01, 0x5a, 0xac,
+ 0x99, 0x2e, 0x56, 0x13, 0xa3, 0xb5, 0xc2, 0xf6,
+ 0x85, 0xb8, 0x47, 0x95, 0xcb, 0x6e, 0x9b, 0x26,
+ 0x56, 0xd8, 0xc8, 0x81, 0x57, 0xe5, 0x2c, 0x42,
+ 0xf9, 0x78, 0xd8, 0x63, 0x4c, 0x43, 0xd0, 0x6f,
+ 0xea, 0x92, 0x8f, 0x28, 0x22, 0xe4, 0x65, 0xaa,
+ 0x65, 0x76, 0xe9, 0xbf, 0x41, 0x93, 0x84, 0x50,
+ 0x6c, 0xc3, 0xce, 0x3c, 0x54, 0xac, 0x1a, 0x6f,
+ 0x67, 0xdc, 0x66, 0xf3, 0xb3, 0x01, 0x91, 0xe6,
+ 0x98, 0x38, 0x0b, 0xc9, 0x99, 0xb0, 0x5a, 0xbc,
+ 0xe1, 0x9d, 0xc0, 0xc6, 0xdc, 0xc2, 0xdd, 0x00,
+ 0x1e, 0xc5, 0x35, 0xba, 0x18, 0xde, 0xb2, 0xdf,
+ 0x1a, 0x10, 0x10, 0x23, 0x10, 0x83, 0x18, 0xc7,
+ 0x5d, 0xc9, 0x86, 0x11, 0xa0, 0x9d, 0xc4, 0x8a,
+ 0x0a, 0xcd, 0xec, 0x67, 0x6f, 0xab, 0xdf, 0x22,
+ 0x2f, 0x07, 0xe0, 0x26, 0xf0, 0x59, 0xb6, 0x72,
+ 0xb5, 0x6e, 0x5c, 0xbc, 0x8e, 0x1d, 0x21, 0xbb,
+ 0xd8, 0x67, 0xdd, 0x92, 0x72, 0x12, 0x05, 0x46,
+ 0x81, 0xd7, 0x0e, 0xa7, 0x37, 0x13, 0x4c, 0xdf,
+ 0xce, 0x93, 0xb6, 0xf8, 0x2a, 0xe2, 0x24, 0x23,
+ 0x27, 0x4e, 0x58, 0xa0, 0x82, 0x1c, 0xc5, 0x50,
+ 0x2e, 0x2d, 0x0a, 0xb4, 0x58, 0x5e, 0x94, 0xde,
+ 0x69, 0x75, 0xbe, 0x5e, 0x0b, 0x4e, 0xfc, 0xe5,
+ 0x1c, 0xd3, 0xe7, 0x0c, 0x25, 0xa1, 0xfb, 0xbb,
+ 0xd6, 0x09, 0xd2, 0x73, 0xad, 0x5b, 0x0d, 0x59,
+ 0x63, 0x1c, 0x53, 0x1f, 0x6a, 0x0a, 0x57, 0xb9
+};
+
+// vlist holds one entry per test vector; five initializers (Vectors 10-14) are listed, and NVEC is defined outside this view.
+// Define vector of structs, with pointers to the statically defined vectors
+
+struct xts_vector vlist[NVEC] = {
+
+ // Positional initializer order: {sizeof(CTX), key1, key2, TW, PTX, CTX}; member names live in struct xts_vector, declared outside this view
+
+ // Vector 10
+ {sizeof(v10_CTX), v10_key1, v10_key2, v10_TW, v10_PTX, v10_CTX}
+ ,
+ // Vector 11
+ {sizeof(v11_CTX), v11_key1, v11_key2, v11_TW, v11_PTX, v11_CTX}
+ ,
+ // Vector 12
+ {sizeof(v12_CTX), v12_key1, v12_key2, v12_TW, v12_PTX, v12_CTX}
+ ,
+ // Vector 13
+ {sizeof(v13_CTX), v13_key1, v13_key2, v13_TW, v13_PTX, v13_CTX}
+ ,
+ // Vector 14
+ {sizeof(v14_CTX), v14_key1, v14_key2, v14_TW, v14_PTX, v14_CTX}
+
+};
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm
new file mode 100644
index 00000000..2635f998
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm
@@ -0,0 +1,78 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel ; memory operands default to RIP-relative addressing
+[bits 64] ; assemble for 64-bit mode
+; WRT_OPT expands to "wrt ..plt" when the output format is elf64 and to nothing otherwise; it is not referenced directly in this file (presumably consumed by the included macros - TODO confirm).
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+; Per-ISA implementations (one _sse and one _avx routine for each of the four entry points); declared extern because they are defined in other object files.
+extern XTS_AES_128_enc_sse
+extern XTS_AES_128_enc_avx
+
+extern XTS_AES_128_enc_expanded_key_sse
+extern XTS_AES_128_enc_expanded_key_avx
+
+extern XTS_AES_128_dec_sse
+extern XTS_AES_128_dec_avx
+
+extern XTS_AES_128_dec_expanded_key_sse
+extern XTS_AES_128_dec_expanded_key_avx
+
+
+section .text
+
+%include "multibinary.asm"
+;;;;
+; Instantiate XTS_AES_128_enc, XTS_AES_128_enc_expanded_key, XTS_AES_128_dec, and XTS_AES_128_dec_expanded_key.
+; Each mbin_dispatch_init line lists the entry-point name, its _sse routine, then its _avx routine twice. NOTE(review): the macros are not defined in this file (presumably in multibinary.asm); confirm there what the last argument slot selects.
+;;;;
+mbin_interface XTS_AES_128_enc
+mbin_dispatch_init XTS_AES_128_enc, XTS_AES_128_enc_sse, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx
+
+mbin_interface XTS_AES_128_enc_expanded_key
+mbin_dispatch_init XTS_AES_128_enc_expanded_key, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx
+
+mbin_interface XTS_AES_128_dec
+mbin_dispatch_init XTS_AES_128_dec, XTS_AES_128_dec_sse, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx
+
+mbin_interface XTS_AES_128_dec_expanded_key
+mbin_dispatch_init XTS_AES_128_dec_expanded_key, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx
+
+
+;;; slversion argument columns: func, core, ver, snum (the macro itself is defined outside this file)
+slversion XTS_AES_128_enc, 01, 04, 0071
+slversion XTS_AES_128_enc_expanded_key, 01, 04, 0072
+slversion XTS_AES_128_dec, 01, 04, 0073
+slversion XTS_AES_128_dec_expanded_key, 01, 04, 0074
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm
new file mode 100644
index 00000000..3452b4f2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm
@@ -0,0 +1,78 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+; WRT_OPT expands to "wrt ..plt" when the output format is elf64 and to nothing otherwise.
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+; External symbols: one _sse and one _avx symbol for each of the four XTS_AES_256 entry points.
+extern XTS_AES_256_enc_sse
+extern XTS_AES_256_enc_avx
+
+extern XTS_AES_256_enc_expanded_key_sse
+extern XTS_AES_256_enc_expanded_key_avx
+
+extern XTS_AES_256_dec_sse
+extern XTS_AES_256_dec_avx
+
+extern XTS_AES_256_dec_expanded_key_sse
+extern XTS_AES_256_dec_expanded_key_avx
+
+
+section .text
+; NOTE(review): mbin_interface / mbin_dispatch_init / slversion are not defined in this file; presumably provided by the included .asm files -- confirm.
+%include "multibinary.asm"
+
+;;;;
+; Instantiate XTS_AES_256_enc, XTS_AES_256_enc_expanded_key, XTS_AES_256_dec, and XTS_AES_256_dec_expanded_key: each gets an mbin_interface plus an mbin_dispatch_init naming its _sse symbol once and its _avx symbol twice.
+;;;;
+mbin_interface XTS_AES_256_enc
+mbin_dispatch_init XTS_AES_256_enc, XTS_AES_256_enc_sse, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx
+; NOTE(review): the _avx symbol fills both of the last two macro arguments in every invocation; presumably the final slot is for a newer ISA level with no dedicated implementation -- confirm against the macro definition.
+mbin_interface XTS_AES_256_enc_expanded_key
+mbin_dispatch_init XTS_AES_256_enc_expanded_key, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx
+
+mbin_interface XTS_AES_256_dec
+mbin_dispatch_init XTS_AES_256_dec, XTS_AES_256_dec_sse, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx
+
+mbin_interface XTS_AES_256_dec_expanded_key
+mbin_dispatch_init XTS_AES_256_dec_expanded_key, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx
+
+
+;;; slversion arguments: func, core, ver, snum (core=01 and ver=04 for all four; snum differs per function: 0076-0079)
+slversion XTS_AES_256_enc, 01, 04, 0076
+slversion XTS_AES_256_enc_expanded_key, 01, 04, 0077
+slversion XTS_AES_256_dec, 01, 04, 0078
+slversion XTS_AES_256_dec_expanded_key, 01, 04, 0079