diff options
Diffstat (limited to 'src/crypto/isa-l/isa-l_crypto/aes')
145 files changed, 77516 insertions, 0 deletions
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am new file mode 100644 index 000000000..d1f4e5781 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am @@ -0,0 +1,170 @@ +######################################################################## +# Copyright(c) 2011-2016 Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+######################################################################## + +# Assembling AES requires including cbc_common.asm, gcm_defines.asm +src_include += -I $(srcdir)/aes + +extern_hdrs += include/aes_gcm.h include/aes_cbc.h include/aes_xts.h include/aes_keyexp.h + +lsrc_x86_64 += aes/gcm_multibinary.asm aes/gcm_pre.c +lsrc_x86_64 += aes/gcm128_avx_gen2.asm aes/gcm128_avx_gen4.asm aes/gcm128_sse.asm +lsrc_x86_64 += aes/gcm256_avx_gen2.asm aes/gcm256_avx_gen4.asm aes/gcm256_sse.asm +lsrc_x86_64 += aes/gcm128_vaes_avx512.asm aes/gcm256_vaes_avx512.asm +lsrc_x86_64 += aes/gcm128_avx_gen2_nt.asm aes/gcm128_avx_gen4_nt.asm aes/gcm128_sse_nt.asm +lsrc_x86_64 += aes/gcm256_avx_gen2_nt.asm aes/gcm256_avx_gen4_nt.asm aes/gcm256_sse_nt.asm +lsrc_x86_64 += aes/gcm128_vaes_avx512_nt.asm aes/gcm256_vaes_avx512_nt.asm + +lsrc_x86_64 += aes/gcm_multibinary_nt.asm + +lsrc_x86_64 += aes/keyexp_multibinary.asm +lsrc_x86_64 += aes/keyexp_128.asm aes/keyexp_192.asm aes/keyexp_256.asm +lsrc_x86_64 += aes/cbc_multibinary.asm +lsrc_x86_64 += aes/cbc_dec_128_x4_sse.asm aes/cbc_dec_128_x8_avx.asm +lsrc_x86_64 += aes/cbc_dec_192_x4_sse.asm aes/cbc_dec_192_x8_avx.asm +lsrc_x86_64 += aes/cbc_dec_256_x4_sse.asm aes/cbc_dec_256_x8_avx.asm +lsrc_x86_64 += aes/cbc_enc_128_x4_sb.asm aes/cbc_enc_128_x8_sb.asm +lsrc_x86_64 += aes/cbc_enc_192_x4_sb.asm aes/cbc_enc_192_x8_sb.asm +lsrc_x86_64 += aes/cbc_enc_256_x4_sb.asm aes/cbc_enc_256_x8_sb.asm +lsrc_x86_64 += aes/cbc_dec_vaes_avx512.asm +lsrc_x86_64 += aes/cbc_pre.c +lsrc_x86_64 += aes/xts_aes_128_multibinary.asm +lsrc_x86_64 += aes/XTS_AES_128_dec_sse.asm aes/XTS_AES_128_dec_expanded_key_sse.asm +lsrc_x86_64 += aes/XTS_AES_128_enc_sse.asm aes/XTS_AES_128_enc_expanded_key_sse.asm +lsrc_x86_64 += aes/XTS_AES_128_dec_avx.asm aes/XTS_AES_128_dec_expanded_key_avx.asm +lsrc_x86_64 += aes/XTS_AES_128_enc_avx.asm aes/XTS_AES_128_enc_expanded_key_avx.asm +lsrc_x86_64 += aes/xts_aes_256_multibinary.asm +lsrc_x86_64 += aes/XTS_AES_256_dec_avx.asm 
aes/XTS_AES_256_dec_expanded_key_avx.asm +lsrc_x86_64 += aes/XTS_AES_256_enc_avx.asm aes/XTS_AES_256_enc_expanded_key_avx.asm +lsrc_x86_64 += aes/XTS_AES_256_dec_sse.asm aes/XTS_AES_256_dec_expanded_key_sse.asm +lsrc_x86_64 += aes/XTS_AES_256_enc_sse.asm aes/XTS_AES_256_enc_expanded_key_sse.asm +lsrc_x86_64 += aes/XTS_AES_256_enc_vaes.asm +lsrc_x86_64 += aes/XTS_AES_128_enc_vaes.asm +lsrc_x86_64 += aes/XTS_AES_256_enc_expanded_key_vaes.asm +lsrc_x86_64 += aes/XTS_AES_128_enc_expanded_key_vaes.asm +lsrc_x86_64 += aes/XTS_AES_256_dec_vaes.asm +lsrc_x86_64 += aes/XTS_AES_128_dec_vaes.asm +lsrc_x86_64 += aes/XTS_AES_256_dec_expanded_key_vaes.asm +lsrc_x86_64 += aes/XTS_AES_128_dec_expanded_key_vaes.asm + +lsrc_x86_32 += $(lsrc_x86_64) + +lsrc_aarch64 += aes/gcm_pre.c \ + aes/aarch64/gcm_multibinary_aarch64.S \ + aes/aarch64/keyexp_multibinary_aarch64.S \ + aes/aarch64/gcm_aarch64_dispatcher.c \ + aes/aarch64/keyexp_aarch64_dispatcher.c \ + aes/aarch64/keyexp_128_aarch64_aes.S \ + aes/aarch64/keyexp_192_aarch64_aes.S \ + aes/aarch64/keyexp_256_aarch64_aes.S \ + aes/aarch64/aes_gcm_aes_finalize_128.S \ + aes/aarch64/aes_gcm_aes_init.S \ + aes/aarch64/aes_gcm_enc_dec_128.S \ + aes/aarch64/aes_gcm_precomp_128.S \ + aes/aarch64/aes_gcm_update_128.S \ + aes/aarch64/aes_gcm_aes_finalize_256.S \ + aes/aarch64/aes_gcm_consts.S \ + aes/aarch64/aes_gcm_enc_dec_256.S \ + aes/aarch64/aes_gcm_precomp_256.S \ + aes/aarch64/aes_gcm_update_256.S \ + aes/aarch64/xts_aarch64_dispatcher.c \ + aes/aarch64/xts_aes_128_dec.S \ + aes/aarch64/xts_aes_128_enc.S \ + aes/aarch64/xts_keyexp_aes_128_dec.S \ + aes/aarch64/xts_keyexp_aes_128_enc.S \ + aes/aarch64/xts_aes_256_dec.S \ + aes/aarch64/xts_aes_256_enc.S \ + aes/aarch64/xts_keyexp_aes_256_dec.S \ + aes/aarch64/xts_keyexp_aes_256_enc.S \ + aes/aarch64/xts_multibinary_aarch64.S \ + aes/cbc_pre.c \ + aes/aarch64/cbc_multibinary_aarch64.S \ + aes/aarch64/cbc_aarch64_dispatcher.c \ + aes/aarch64/cbc_enc_aes.S \ + aes/aarch64/cbc_dec_aes.S + 
+other_src += include/multibinary.asm +other_src += include/test.h include/types.h include/reg_sizes.asm +other_src += aes/gcm_defines.asm +other_src += aes/aes_common.asm +other_src += aes/clear_regs.asm +other_src += aes/cbc_common.asm aes/cbc_std_vectors.h +other_src += aes/gcm_vectors.h aes/ossl_helper.h +other_src += aes/xts_128_vect.h +other_src += aes/xts_256_vect.h +other_src += aes/gcm_sse.asm +other_src += aes/gcm_avx_gen2.asm +other_src += aes/gcm_avx_gen4.asm +other_src += aes/gcm_keys_vaes_avx512.asm +other_src += aes/gcm_vaes_avx512.asm + +check_tests += aes/cbc_std_vectors_test +check_tests += aes/gcm_std_vectors_test +check_tests += aes/gcm_nt_std_vectors_test +check_tests += aes/xts_128_test +check_tests += aes/xts_256_test +check_tests += aes/xts_128_expanded_key_test +check_tests += aes/xts_256_expanded_key_test + +unit_tests += aes/cbc_std_vectors_random_test +unit_tests += aes/gcm_std_vectors_random_test +unit_tests += aes/gcm_nt_rand_test +unit_tests += aes/xts_128_rand aes/xts_128_rand_ossl_test +unit_tests += aes/xts_256_rand aes/xts_256_rand_ossl_test + +perf_tests += aes/cbc_ossl_perf +perf_tests += aes/gcm_ossl_perf +perf_tests += aes/xts_128_enc_ossl_perf +perf_tests += aes/xts_256_enc_ossl_perf +perf_tests += aes/xts_128_enc_perf aes/xts_128_dec_perf aes/xts_128_dec_ossl_perf +perf_tests += aes/xts_256_enc_perf aes/xts_256_dec_perf aes/xts_256_dec_ossl_perf + +examples += aes/gcm_simple_example + +cbc_ossl_perf: LDLIBS += -lcrypto +aes_cbc_ossl_perf_LDFLAGS = -lcrypto +cbc_std_vectors_random_test: LDLIBS += -lcrypto +aes_cbc_std_vectors_random_test_LDFLAGS = -lcrypto +gcm_ossl_perf: LDLIBS += -lcrypto +aes_gcm_ossl_perf_LDFLAGS = -lcrypto +gcm_std_vectors_random_test: LDLIBS += -lcrypto +aes_gcm_std_vectors_random_test_LDFLAGS = -lcrypto +gcm_nt_rand_test: LDLIBS += -lcrypto +aes_gcm_nt_rand_test_LDFLAGS = -lcrypto +xts_128_enc_ossl_perf: LDLIBS += -lcrypto +aes_xts_128_enc_ossl_perf_LDFLAGS = -lcrypto +xts_128_dec_ossl_perf: LDLIBS += 
-lcrypto +aes_xts_128_dec_ossl_perf_LDFLAGS = -lcrypto +xts_128_rand_ossl_test: LDLIBS += -lcrypto +aes_xts_128_rand_ossl_test_LDFLAGS = -lcrypto +xts_256_enc_ossl_perf : LDLIBS += -lcrypto +aes_xts_256_enc_ossl_perf_LDFLAGS = -lcrypto +xts_256_dec_ossl_perf : LDLIBS += -lcrypto +aes_xts_256_dec_ossl_perf_LDFLAGS = -lcrypto +xts_256_rand_ossl_test: LDLIBS += -lcrypto +aes_xts_256_rand_ossl_test_LDFLAGS = -lcrypto diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm new file mode 100644 index 000000000..85582c0df --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm @@ -0,0 +1,1778 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; round keys of the data key k1 (second argument) are stored on the stack, aligned to 16 bytes +; the tweak key k2 (first argument) is required only once, so its round keys are not stored + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_dec_avx( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn 
__OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 9 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%xtmp2 %6 +%define %%ptr_key2 %7 +%define %%ptr_key1 %8 +%define %%ptr_expanded_keys %9 + + + vmovdqu %%xkey2, [%%ptr_key2] + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist 
%%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 
; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define 
%%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor 
twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values are generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor 
ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec 
%%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, 
%%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 
+%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec 
%%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b 
+ xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + 
vaesdeclast %%ST5, %%T0 + vaesdeclast %%ST6, %%T0 + vaesdeclast %%ST7, %%T0 + vaesdeclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_avx, function +XTS_AES_128_dec_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 8 blocks (128 bytes) will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp
tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], 
xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp 
ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + vmovdqa xmm1, [TW + 16*7] + vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu 
[ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is the blend mask (explicit operand in the VEX form) + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys +
16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu 
[ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, 
xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial 
xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, 
xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887,
0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm new file mode 100644 index 000000000..faa7e895e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm @@ -0,0 +1,1748 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 128-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_dec_expanded_key_avx( +; UINT8 *k2, // key used for tweaking, 16*11 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in 
bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for 
tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*0] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; 
state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, 
[TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 +
vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + 
vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor 
ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor 
%%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == 
%%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph 
+%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + vaesdeclast %%ST5, %%T0 + vaesdeclast %%ST6, %%T0 + vaesdeclast %%ST7, %%T0 + 
vaesdeclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_expanded_key_avx, function +XTS_AES_128_dec_expanded_key_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 8 blocks (128 bytes) will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, 
(5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu 
[ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; 
store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: 
+ ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + vmovdqa xmm1, [TW + 16*7] + vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu 
[ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 holds the blend mask (passed as an explicit operand in this VEX form) + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;decrypt last block with cipher stealing (vaesdec rounds below) + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys 
+ 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm15 + vmovdqa xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp 
_steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm14 + vmovdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 
16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm13 + vmovdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm12 + vmovdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, 
xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm11 + vmovdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm10 + vmovdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu 
[ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm9 + vmovdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; 
shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm new file mode 100644 index 000000000..0b1b637be --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm @@ -0,0 +1,1747 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
; XTS decrypt function with 128-bit AES, taking pre-expanded round keys.
;
; - The caller's round keys are read with unaligned loads (movdqu).
; - The input and output buffers are accessed with unaligned loads/stores.
; - The k1 round keys (second argument, applied to the data blocks) are
;   copied into a 16-byte aligned area of this function's stack frame.
; - The k2 round keys (first argument, applied to the tweak) are consumed
;   once while the initial tweak is encrypted, so they are not copied.

%include "reg_sizes.asm"

default rel

; Stack frame, addressed from rsp after "sub rsp, VARIABLE_OFFSET":
;   [rsp + 16*0  .. rsp + 16*8)    TW    8 tweak slots of 16 bytes
;   [rsp + 16*8  .. rsp + 16*19)   keys  11 round keys of 16 bytes
;   elf64:
;     [rsp + 16*19]                _gpr  rbx
;   every other output format:
;     [rsp + 16*19 .. rsp + 16*29) _xmm  xmm6..xmm15 (symbol defined for win64 only)
;     [rsp + 16*29 .. + 8*3)       _gpr  rbx, rdi, rsi (in that slot order)
%define TW rsp ; store 8 tweak values
%define keys rsp + 16*8 ; store 11 expanded keys

%ifidn __OUTPUT_FORMAT__, win64
 %define _xmm rsp + 16*19 ; store xmm6:xmm15
%endif

; rsp is 8 modulo 16 on entry (the return address was just pushed), so
; subtracting an odd multiple of 8 leaves rsp 16-byte aligned. The movdqa
; accesses to TW, keys and _xmm depend on that alignment.
; NOTE(review): the %else branch below is taken for every output format other
; than elf64, not only for win64.
%ifidn __OUTPUT_FORMAT__, elf64
%define _gpr rsp + 16*19 ; store rbx
%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
%else
%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
%endif

; Low 64 bits xored into the tweak whenever doubling it carries out of bit 127.
%define GHASH_POLY 0x87

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void XTS_AES_128_dec_expanded_key_sse(
; UINT8 *k2, // key used for tweaking, 16*11 bytes
; UINT8 *k1, // key used for "ECB" decryption of the data blocks, 16*11 bytes
; UINT8 *TW_initial, // initial tweak value, 16 bytes
; UINT64 N, // sector size, in bytes
; const UINT8 *ct, // ciphertext sector input data
; UINT8 *pt); // plaintext sector output data
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; arguments for input parameters
; The first branch follows the System V argument registers, the second the
; Microsoft x64 ones.
; NOTE: the ptr_plaintext / ptr_ciphertext aliases are inverted relative to
; this decrypt signature: ptr_plaintext is bound to the fifth argument (ct,
; the INPUT that is loaded) and ptr_ciphertext to the sixth argument (pt, the
; OUTPUT that is stored).
%ifidn __OUTPUT_FORMAT__, elf64
 %xdefine ptr_key2 rdi
 %xdefine ptr_key1 rsi
 %xdefine T_val rdx
 %xdefine N_val rcx
 %xdefine ptr_plaintext r8
 %xdefine ptr_ciphertext r9
%else
 %xdefine ptr_key2 rcx
 %xdefine ptr_key1 rdx
 %xdefine T_val r8
 %xdefine N_val r9
 %xdefine ptr_plaintext r10; loaded by the function from [rsp + VARIABLE_OFFSET + 8*5]
 %xdefine ptr_ciphertext r11; loaded by the function from [rsp + VARIABLE_OFFSET + 8*6]
%endif

; arguments for temp parameters
; tmp1 and target_ptr_val reuse the registers that carry ptr_key2 and
; ptr_key1, so they may only be written once the key pointers are no longer
; needed (i.e. after encrypt_T has run).
; In the non-elf64 branch ghash_poly_8b / ghash_poly_8b_temp live in rdi / rsi,
; which is why that branch reserves _gpr slots for rdi and rsi.
%ifidn __OUTPUT_FORMAT__, elf64
 %define tmp1 rdi
 %define target_ptr_val rsi
 %define ghash_poly_8b r10
 %define ghash_poly_8b_temp r11
%else
 %define tmp1 rcx
 %define target_ptr_val rdx
 %define ghash_poly_8b rdi
 %define ghash_poly_8b_temp rsi
%endif

; twtemph:twtempl hold the most recently generated 128-bit tweak (high:low).
; rbx is preserved through the first _gpr slot.
%define twtempl rax ; global temp registers used for tweak computation
%define twtemph rbx
; encrypt_T - encrypt the initial tweak and stage the data round keys
;
; Parameters (substituted textually):
;   %1  xmm scratch that receives each tweak-key round key
;   %2  xmm holding the raw tweak on entry and the encrypted tweak on exit
;   %3  xmm scratch used to carry each data round key to the stack
;   %4  not referenced by this macro
;   %5  not referenced by this macro
;   %6  GPR pointing at the 11 tweak round keys (read with movdqu)
;   %7  GPR pointing at the 11 data round keys (read with movdqu)
;   %8  16-byte aligned base address that receives the 11 data round keys
;
; Effects:
;   - %2 is run through the ten AES-128 encryption rounds keyed from %6.
;   - The round keys at %7 are copied, index for index, to %8. The copy is
;     interleaved with the tweak rounds and walks from index 10 down to 0.
;   - The encrypted tweak is written to [TW].
;   - %1 and %3 are clobbered.
%macro encrypt_T 8
%define %%tweak_rk      %1
%define %%tweak         %2
%define %%data_rk       %3
%define %%tweak_keys    %6
%define %%data_keys     %7
%define %%key_stash     %8

        ; round 0: whitening with the first tweak round key
        movdqu  %%tweak_rk, [%%tweak_keys]
        pxor    %%tweak, %%tweak_rk
        movdqu  %%data_rk, [%%data_keys + 16*10]
        movdqa  [%%key_stash + 16*10], %%data_rk

        ; rounds 1..9: one aesenc each, paired with the copy of data key (10 - round)
%assign %%rnd 1
%rep 9
        movdqu  %%tweak_rk, [%%tweak_keys + 16*%%rnd]
        aesenc  %%tweak, %%tweak_rk
        movdqu  %%data_rk, [%%data_keys + 16*(10 - %%rnd)]
        movdqa  [%%key_stash + 16*(10 - %%rnd)], %%data_rk
%assign %%rnd %%rnd + 1
%endrep

        ; round 10: final round, paired with the copy of data key 0
        movdqu  %%tweak_rk, [%%tweak_keys + 16*10]
        aesenclast %%tweak, %%tweak_rk
        movdqu  %%data_rk, [%%data_keys + 16*0]
        movdqa  [%%key_stash + 16*0], %%data_rk

        ; publish the encrypted tweak in tweak slot 0
        movdqa  [TW], %%tweak
%endmacro
; initialize - derive the first tweaks and load the first input blocks
;
; Parameters (substituted textually):
;   %1..%8   state registers; only the first %16 of them are loaded
;   %9..%15  tweak registers; only the first %16 of them are loaded
;   %16      number of blocks to set up; callers pass 1..7
;
; Effects, for block i = 0 .. %16-1:
;   - tweak i is left in tweak slot i of TW and in tweak register i. Slot 0
;     is expected to already hold the first tweak; every later tweak is the
;     previous one doubled (shift left by one across twtemph:twtempl, with
;     ghash_poly_8b xored into the low half when a bit is carried out).
;   - state register i is loaded from [ptr_plaintext + 16*i].
;   On exit twtemph:twtempl hold the last tweak produced.
;   ghash_poly_8b_temp and the flags are clobbered when %16 > 1.
%macro initialize 16
%define %%num_initial_blocks %16

        ; block 0: the tweak is already in slot 0
        movdqa  %9, [TW + 16*0]
        mov     twtempl, [TW + 8*0]
        mov     twtemph, [TW + 8*1]
        movdqu  %1, [ptr_plaintext + 16*0]

%if (%%num_initial_blocks > 1)
%assign %%blk 1
%rep %%num_initial_blocks - 1
%rotate 1                               ; %1 / %9 now name state / tweak register %%blk
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*(2*%%blk)], twtempl
        mov     [TW + 8*(2*%%blk + 1)], twtemph
        movdqa  %9, [TW + 16*%%blk]
        movdqu  %1, [ptr_plaintext + 16*%%blk]
%assign %%blk %%blk + 1
%endrep
%endif

%endmacro
; encrypt_initial - XTS-decrypt 1..7 blocks held in registers
; (the name is kept from the encrypt variant; the rounds are aesdec/aesdeclast)
;
; Parameters (substituted textually):
;   %1..%7   state registers; the first %17 of them are processed in place
;   %8       not referenced by this macro
;   %9..%15  tweak registers matching %1..%7
;   %16      xmm scratch that receives each round key from the keys area
;   %17      number of blocks, 1..7
;   %18      0: additionally generate the next eight tweaks (see below)
;            any other value: do no tweak generation
;
; When %18 is 0:
;   - on entry twtemph:twtempl must hold the last tweak already handed out;
;   - the next eight tweaks are written to TW slots 0..7, twtemph:twtempl
;     end up holding the one stored in slot 7, and %9..%15 are reloaded from
;     slots 0..6 after the blocks have been processed;
;   - ghash_poly_8b_temp and the flags are clobbered.
;   The tweak arithmetic is scattered between the AES rounds and its carry
;   flag is deliberately kept alive across them (movdqa / aesdec leave the
;   flags untouched), so the scalar snippets must stay in this exact order.
%macro encrypt_initial 18
%define %%rkey          %16
%define %%num_blocks    %17
%define %%lt128         %18

        ; pre-whitening: state[i] ^= tweak[i]
%rep %%num_blocks
        pxor    %1, %9
%rotate 1
%endrep
%rotate -(%%num_blocks)

        ; round 0 (AddRoundKey)
        movdqa  %%rkey, [keys]
%rep %%num_blocks
        pxor    %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
%endif

        ; round 1
        movdqa  %%rkey, [keys + 16*1]
%rep %%num_blocks
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*0], twtempl             ; slot 0 complete
        mov     [TW + 8*1], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif

        ; round 2
        movdqa  %%rkey, [keys + 16*2]
%rep %%num_blocks
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*2], twtempl             ; slot 1, low half
%endif

        ; round 3
        movdqa  %%rkey, [keys + 16*3]
%rep %%num_blocks
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        mov     [TW + 8*3], twtemph             ; slot 1, high half
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif

        ; round 4
        movdqa  %%rkey, [keys + 16*4]
%rep %%num_blocks
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*4], twtempl             ; slot 2 complete
        mov     [TW + 8*5], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1                      ; carry consumed after round 5
%endif

        ; round 5
        movdqa  %%rkey, [keys + 16*5]
%rep %%num_blocks
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*6], twtempl             ; slot 3 complete
        mov     [TW + 8*7], twtemph
%endif

        ; round 6
        movdqa  %%rkey, [keys + 16*6]
%rep %%num_blocks
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*8], twtempl             ; slot 4 complete
        mov     [TW + 8*9], twtemph
%endif

        ; round 7
        movdqa  %%rkey, [keys + 16*7]
%rep %%num_blocks
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*10], twtempl            ; slot 5 complete
        mov     [TW + 8*11], twtemph
%endif

        ; round 8
        movdqa  %%rkey, [keys + 16*8]
%rep %%num_blocks
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*12], twtempl            ; slot 6 complete
        mov     [TW + 8*13], twtemph
%endif

        ; round 9
        movdqa  %%rkey, [keys + 16*9]
%rep %%num_blocks
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*14], twtempl            ; slot 7 complete
        mov     [TW + 8*15], twtemph
%endif

        ; round 10 (final)
        movdqa  %%rkey, [keys + 16*10]
%rep %%num_blocks
        aesdeclast %1, %%rkey
%rotate 1
%endrep
%rotate -(%%num_blocks)

        ; post-whitening: state[i] ^= tweak[i] (still the tweaks passed in)
%rep %%num_blocks
        pxor    %1, %9
%rotate 1
%endrep
%rotate -(%%num_blocks)

%if (0 == %%lt128)
        ; hand the freshly generated tweaks in slots 0..6 to all seven tweak registers
%assign %%slot 0
%rep 7
        movdqa  %9, [TW + 16*%%slot]
%rotate 1
%assign %%slot %%slot + 1
%endrep
%endif

%endmacro
; encrypt_by_eight - XTS-decrypt eight blocks held in registers
; (the name is kept from the encrypt variant; the rounds are aesdec/aesdeclast)
;
; Parameters (substituted textually):
;   %1..%8   state registers, processed in place
;   %9..%15  tweak registers for blocks 1..7
;   %16      tweak operand for block 8; it only appears as a pxor source
;   %17      xmm scratch that receives each round key from the keys area
;   %18      0: additionally generate the next eight tweaks
;            any other value: do no tweak generation
;
; When %18 is 0, twtemph:twtempl must hold the last tweak handed out on
; entry. The next eight tweaks are produced between the AES rounds, with the
; carry flag kept alive across them (movdqa / aesdec leave the flags
; untouched), so the scalar snippets must stay in this exact order. Seven of
; them go to TW slots 0..6 as they are produced.
;
; Regardless of %18, the macro finishes by storing twtemph:twtempl to TW
; slot 7 and reloading %9..%15 from TW slots 0..6. The slot 7 store is
; deferred until after the final pxor so that %16 may itself be the memory
; operand [TW+16*7].
; ghash_poly_8b_temp and the flags are clobbered when %18 is 0.
%macro encrypt_by_eight 18
%define %%rkey          %17
%define %%last_eight    %18

        ; pre-whitening: state[i] ^= tweak[i]
%rep 8
        pxor    %1, %9
%rotate 1
%endrep
%rotate -8

        ; round 0 (AddRoundKey)
        movdqa  %%rkey, [keys]
%rep 8
        pxor    %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif

        ; round 1
        movdqa  %%rkey, [keys + 16*1]
%rep 8
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*0], twtempl             ; slot 0 complete
        mov     [TW + 8*1], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif

        ; round 2
        movdqa  %%rkey, [keys + 16*2]
%rep 8
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
%endif

        ; round 3
        movdqa  %%rkey, [keys + 16*3]
%rep 8
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        mov     [TW + 8*2], twtempl             ; slot 1 complete
        mov     [TW + 8*3], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1                      ; carry consumed after round 4
%endif

        ; round 4
        movdqa  %%rkey, [keys + 16*4]
%rep 8
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*4], twtempl             ; slot 2, low half
%endif

        ; round 5
        movdqa  %%rkey, [keys + 16*5]
%rep 8
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        mov     [TW + 8*5], twtemph             ; slot 2, high half
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph                ; carry consumed after round 6
%endif

        ; round 6
        movdqa  %%rkey, [keys + 16*6]
%rep 8
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*6], twtempl             ; slot 3 complete
        mov     [TW + 8*7], twtemph
%endif

        ; round 7
        movdqa  %%rkey, [keys + 16*7]
%rep 8
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
%endif

        ; round 8
        movdqa  %%rkey, [keys + 16*8]
%rep 8
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*8], twtempl             ; slot 4 complete
        mov     [TW + 8*9], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
%endif

        ; round 9
        movdqa  %%rkey, [keys + 16*9]
%rep 8
        aesdec  %1, %%rkey
%rotate 1
%endrep
%rotate -8

%if (0 == %%last_eight)
        ; remaining tweak arithmetic for slots 5, 6 and 7
        shl     twtempl, 1
        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp

        mov     [TW + 8*10], twtempl            ; slot 5 complete
        mov     [TW + 8*11], twtemph
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1

        adc     twtemph, twtemph
        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp
        mov     [TW + 8*12], twtempl            ; slot 6, low half

        mov     [TW + 8*13], twtemph            ; slot 6, high half
        xor     ghash_poly_8b_temp, ghash_poly_8b_temp
        shl     twtempl, 1
        adc     twtemph, twtemph

        cmovc   ghash_poly_8b_temp, ghash_poly_8b
        xor     twtempl, ghash_poly_8b_temp     ; slot 7 value; stored after the final pxor
%endif

        ; round 10 (final)
        movdqa  %%rkey, [keys + 16*10]
%rep 8
        aesdeclast %1, %%rkey
%rotate 1
%endrep
%rotate -8

        ; post-whitening: state[i] ^= tweak[i]; block 8 reads %16 here
%rep 8
        pxor    %1, %9
%rotate 1
%endrep
%rotate -8

        ; %16 has been consumed, so slot 7 may now be overwritten
        mov     [TW + 8*14], twtempl
        mov     [TW + 8*15], twtemph

        ; reload tweak registers 1..7 from slots 0..6
%assign %%slot 0
%rep 7
        movdqa  %9, [TW + 16*%%slot]
%rotate 1
%assign %%slot %%slot + 1
%endrep

%endmacro
twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_expanded_key_sse, function +XTS_AES_128_dec_expanded_key_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, 
xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + 
encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl 
twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + 
add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + movdqa xmm1, [TW + 16*7] + movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu 
xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesdec xmm8, [keys + 16*1] ; round 1 + aesdec xmm8, [keys + 16*2] ; round 2 + aesdec xmm8, [keys + 16*3] ; round 3 + aesdec xmm8, [keys + 16*4] ; round 4 + aesdec xmm8, [keys + 16*5] ; round 5 + aesdec xmm8, [keys + 16*6] ; round 6 + aesdec xmm8, [keys + 16*7] ; round 7 + aesdec xmm8, [keys + 16*8] ; round 8 + aesdec xmm8, [keys + 16*9] ; round 9 + aesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 
16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm15 + movdqa xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 
16*1 + movdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm14 + movdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm13 + movdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store 
ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm12 + movdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + 
+_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm11 + movdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm10 + movdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, 
xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm9 + movdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction (_steal_cipher loads 16 bytes at offset N_val and at offset 16-N_val, N_val = 1..15) +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 
0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm new file mode 100644 index 000000000..7f243949a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm @@ -0,0 +1,1648 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 128-bit AES +; expanded keys are not aligned +; keys are supplied already expanded; the k1 round keys are copied to the stack in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; data key (k1) is stored in the stack as aligned to 16 Bytes +; tweak key (k2) is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; room for 15 expanded keys (only round keys 0..10 are used in this 128-bit file) + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_dec_expanded_key_vaes( +; UINT8 *k2, // expanded key used for tweaking, 16*11 bytes +; UINT8 *k1, // expanded key used for "ECB" decryption, 16*11 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data (register alias below: ptr_plaintext) +; UINT8 *pt); // plaintext sector output data (register alias below: ptr_ciphertext) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; 
arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; macro to encrypt the tweak value with the 11 round keys at %%ptr_key2 and, between rounds, copy the 11 round keys at %%ptr_key1 (same order) to %%ptr_expanded_keys; the encrypted tweak is stored to [TW] + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 ; not referenced in this macro body +%define %%xtmp %5 ; not referenced in this macro body +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 
4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*0] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; Original way to generate initial tweak values and load plaintext values +; only used for small blocks +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; 
tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + 
cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; Original decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values can be generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if 
(%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor 
twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 
8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif 
+%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + +; Decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + 
vaesdec %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Decrypt 16 blocks in parallel +; generate next 16 tweak values (4 per zmm register, computed only when %%last_eight == 0) +%macro decrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 ; nonzero: skip the next-tweak computation + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 
+ vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_expanded_key_vaes, function +XTS_AES_128_dec_expanded_key_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer -- NOTE(review): with ALIGN_STACK defined, rsp was changed by 'push rbp' and 'and rsp, ~63' before this load, so rsp + VARIABLE_OFFSET + 8*5 no longer addresses the 5th-argument slot (rbp + 8 + 8*5 would); confirm on win64 + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer -- NOTE(review): same concern as the previous load (rbp + 8 + 8*6?) +%endif + + cmp N_val, 128 + jl _less_than_128_bytes ; NOTE(review): jl/jge here are signed compares on the UINT64 byte count; jb/jae are the unsigned forms -- confirm N never exceeds 2^63-1 + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqu xmm1, [ptr_plaintext - 16] ; Re-do last block with next tweak + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext - 16], xmm1 + vmovdqa xmm8, xmm1 + + ; Calc previous tweak + mov tmp1, 1 + kmovq k1, tmp1 + vpsllq xmm13, xmm9, 63 + vpsraq xmm14, xmm13, 63 + vpandq xmm5, xmm14, XWORD(zpoly) + vpxorq xmm9 {k1}, xmm9, xmm5 + vpsrldq xmm10, xmm9, 8 + vpshrdq xmm0, xmm9, xmm10, 1 + vpslldq xmm13, xmm13, 8 + vpxorq xmm0, xmm0, xmm13 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7_remain + vextracti32x4 xmm12, zmm10, 2 + vextracti32x4 xmm13, zmm10, 3 + vinserti32x4 zmm10, xmm13, 2 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + vextracti32x4 xmm8, zmm2, 0x2 + vmovdqa xmm0, xmm12 + jmp 
xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 +%ifdef ALIGN_STACK + ; rsp was realigned (and rsp, ~63) after push rbp, so stack arguments are addressed from rbp: [rbp] = saved rbp, [rbp + 8] = return address + mov ptr_plaintext, [rbp + 8 + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rbp + 8 + 8*6] ; ciphertext pointer +%else + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqu xmm1, [ptr_plaintext - 16] ; Redo last block with next tweak + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext - 16], xmm1 + vmovdqa xmm8, xmm1 + + ; Calc previous tweak + mov tmp1, 1 + kmovq k1, tmp1 + vpsllq xmm13, xmm9, 63 + vpsraq xmm14, xmm13, 63 + vpandq xmm5, xmm14, XWORD(zpoly) + vpxorq xmm9 {k1}, xmm9, xmm5 + vpsrldq xmm10, xmm9, 8 + vpshrdq xmm0, xmm9, xmm10, 1 + vpslldq xmm13, xmm13, 8 + vpxorq xmm0, xmm0, xmm13 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7_remain + vextracti32x4 xmm12, zmm10, 2 + vextracti32x4 xmm13, zmm10, 3 + vinserti32x4 zmm10, xmm13, 2 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + vextracti32x4 xmm8, zmm2, 0x2 + vmovdqa xmm0, xmm12 + jmp 
_steal_cipher +_done_7_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + jmp _ret_ + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6_remain + vextracti32x4 xmm12, zmm10, 1 + vextracti32x4 xmm13, zmm10, 2 + vinserti32x4 zmm10, xmm13, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + vextracti32x4 xmm8, zmm2, 0x1 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_6_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + jmp _ret_ + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5_remain + vmovdqa xmm12, xmm10 + vextracti32x4 xmm10, zmm10, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_5_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4_remain + vextracti32x4 xmm12, zmm9, 3 + vinserti32x4 zmm9, xmm10, 3 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + vextracti32x4 xmm8, zmm1, 0x3 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_4_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + jmp _ret_ + +_remaining_num_blocks_is_3: + 
vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3_remain + vextracti32x4 xmm13, zmm9, 2 + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + vmovdqa xmm0, xmm13 + jmp _steal_cipher +_done_3_remain: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + jmp _ret_ + +_remaining_num_blocks_is_2: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2_remain + vextracti32x4 xmm10, zmm9, 2 + vextracti32x4 xmm12, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_2_remain: + vextracti32x4 xmm10, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + and N_val, 15 + je _done_1_remain + vextracti32x4 xmm11, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + vmovdqa xmm8, xmm1 + 
vmovdqa xmm0, xmm9 + jmp _steal_cipher +_done_1_remain: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + jmp _ret_ + + + +_start_by16: + ; Make first 8 tweak values (initial tweak x 2^0 .. 2^7) + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweak values by multiplying all by 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_start_by8: + ; Make first 8 tweak values (initial tweak x 2^0 .. 2^7) + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, 
zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + 
vpxor xmm8, xmm8, xmm0 + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa64 xmm16, xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa64 xmm0, xmm16 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, 
xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + jmp _done + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm15, xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm0, xmm15 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + jmp _done + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], 
twtemph + vmovdqa xmm14, xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm0, xmm14 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + jmp _done + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm13, xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm0, xmm13 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, 
xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm12, xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm0, xmm12 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + jmp _done + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm11, xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm0, xmm11 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + jmp _done + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add 
ptr_plaintext, 16*1 + and N_val, 15 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm0, xmm10 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 
0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_128_dec_expanded_key_vaes +no_XTS_AES_128_dec_expanded_key_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm new file mode 100644 index 000000000..19f887c2f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm @@ -0,0 +1,1779 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_dec_sse( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn 
__OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + +; produce the key for the next round +; raw_key is the output of aeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 11111111b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 9 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%xtmp2 %6 +%define %%ptr_key2 %7 +%define %%ptr_key1 %8 +%define %%ptr_expanded_keys %9 + + + movdqu %%xkey2, [%%ptr_key2] + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, 
%%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*9], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*8], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*7], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*6], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*5], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + 
key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*4], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*3], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*2], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys + 16*1], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*0], %%xkey1 + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 
%4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], 
twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values are generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 
1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif 
+%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + 
adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) 
+ pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, 
ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 
+ aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdeclast %%ST1, %%T0 + aesdeclast %%ST2, %%T0 + aesdeclast %%ST3, %%T0 + aesdeclast %%ST4, %%T0 + aesdeclast %%ST5, %%T0 + aesdeclast %%ST6, %%T0 + aesdeclast %%ST7, %%T0 + aesdeclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 
8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_sse, function +XTS_AES_128_dec_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, 
xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 
16*4 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp 
+ shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] 
+ + add ptr_plaintext, 128 + + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + movdqa xmm1, [TW + 16*7] + movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + 
movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;decrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesdec xmm8, [keys + 16*1] ; round 1 + aesdec xmm8, [keys + 16*2] ; round 2 + aesdec xmm8, [keys + 16*3] ; round 3 + aesdec xmm8, [keys + 16*4] ; round 4 + aesdec xmm8, [keys + 16*5] ; round 5 + aesdec xmm8, [keys + 16*6] ; round 6 + aesdec xmm8, [keys + 16*7] ; round 7 + aesdec xmm8, [keys + 16*8] ; round 8 + aesdec xmm8, [keys + 16*9] ; round 9 + aesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, 
[_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm15 + movdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub 
ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm14 + movdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm13 + movdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 
5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm12 + movdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + 
+_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm11 + movdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm10 + movdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, 
xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm9 + movdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 
0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm new file mode 100644 index 000000000..e3435dd83 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm @@ -0,0 +1,1681 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; expanded round keys: 11 used (AES-128), 15 slots reserved + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_dec_vaes( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; 
arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 9 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%xtmp2 %6 +%define %%ptr_key2 %7 +%define %%ptr_key1 %8 +%define %%ptr_expanded_keys %9 + + + vmovdqu %%xkey2, [%%ptr_key2] + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + 
vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist 
%%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; Original way to generate initial tweak values and load plaintext values +; only used for small blocks 
+%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, 
twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; Original decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values can be generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + 
vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + 
vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec 
%%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 
+%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + +; Decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, 
[keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Decrypt 16 blocks in parallel +; generate next 16 tweak values (four zmm registers of four tweaks each) +; zmm13-zmm18 are used as scratch for the tweak update +%macro decrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 ; 0 = also compute the next tweak values + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + ; zmm15 = TW3 * x^8: shift each lane left one byte, fold the spilled top byte back in via zpoly + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + ; zmm16 = TW4 * x^8 (same byte-shift and zpoly fold as above) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 
16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_128_dec_vaes, function +XTS_AES_128_dec_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 
16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + ; Arguments 5 and 6 sit above the return address and the 32-byte shadow space, + ; i.e. at entry_rsp + 8*5 and entry_rsp + 8*6. + ; With ALIGN_STACK the prologue pushes rbp and rounds rsp down to a 64-byte boundary, + ; so rsp + VARIABLE_OFFSET is no longer entry_rsp; address them via rbp (= entry_rsp - 8). +%ifdef ALIGN_STACK + mov ptr_plaintext, [rbp + 8 + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rbp + 8 + 8*6] ; ciphertext pointer +%else + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif +%endif + + ; N_val is an unsigned byte count (UINT64 N): branch on unsigned conditions + cmp N_val, 128 + jb _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jae _start_by16 + + cmp N_val, 128 + jae _start_by8 + +_do_n_blocks: + ; dispatch on the number of whole 16-byte blocks still to process + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jae _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jae _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jae _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jae _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jae _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jae _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jae _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqu xmm1, [ptr_plaintext - 16] ; Redo last block with next tweak + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext - 16], xmm1 + vmovdqa xmm8, xmm1 + + ; Calc previous tweak: xmm0 = xmm9 divided by x in GF(2^128) + ; (if bit 0 is set, xor the polynomial into the low qword, shift the 128-bit value right by 1, then set bit 127) + mov tmp1, 1 + kmovq k1, tmp1 + vpsllq xmm13, xmm9, 63 + vpsraq xmm14, xmm13, 63 + vpandq xmm5, xmm14, XWORD(zpoly) + vpxorq xmm9 {k1}, xmm9, xmm5 + vpsrldq xmm10, xmm9, 8 + vpshrdq xmm0, xmm9, xmm10, 1 + vpslldq xmm13, xmm13, 8 + vpxorq xmm0, xmm0, xmm13 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + ; k1 = low 48 bits set: byte mask covering the last 3 of the 7 blocks + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7_remain + vextracti32x4 xmm12, 
zmm10, 2 + vextracti32x4 xmm13, zmm10, 3 + vinserti32x4 zmm10, xmm13, 2 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + vextracti32x4 xmm8, zmm2, 0x2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_7_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + jmp _ret_ + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6_remain + vextracti32x4 xmm12, zmm10, 1 + vextracti32x4 xmm13, zmm10, 2 + vinserti32x4 zmm10, xmm13, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + vextracti32x4 xmm8, zmm2, 0x1 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_6_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + jmp _ret_ + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5_remain + vmovdqa xmm12, xmm10 + vextracti32x4 xmm10, zmm10, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_5_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4_remain + vextracti32x4 xmm12, zmm9, 3 + vinserti32x4 zmm9, xmm10, 3 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + 
vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + vextracti32x4 xmm8, zmm1, 0x3 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_4_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + jmp _ret_ + +_remaining_num_blocks_is_3: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3_remain + vextracti32x4 xmm13, zmm9, 2 + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + vmovdqa xmm0, xmm13 + jmp _steal_cipher +_done_3_remain: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + jmp _ret_ + +_remaining_num_blocks_is_2: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2_remain + vextracti32x4 xmm10, zmm9, 2 + vextracti32x4 xmm12, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_2_remain: + vextracti32x4 xmm10, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_1: + vmovdqu xmm1, 
[ptr_plaintext] + add ptr_plaintext, 16 + and N_val, 15 + je _done_1_remain + vextracti32x4 xmm11, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + vmovdqa xmm8, xmm1 + vmovdqa xmm0, xmm9 + jmp _steal_cipher +_done_1_remain: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + jmp _ret_ + + + +_start_by16: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 ; k2 = 10101010b: write-mask selecting the high qword of every 128-bit lane + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweak values by multiplying all 8 by 2^8: shift each lane left by one byte and fold the byte shifted out back in via a carry-less multiply with zpoly + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + cmp 
N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 ; k2 = 10101010b: write-mask selecting the high qword of every 128-bit lane + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec 
xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdeclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + ; common epilogue: restore the saved registers, release the stack frame and return + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + vzeroupper ; zmm registers are used by this routine: clear the upper vector state so a caller running legacy-SSE code does not pay an AVX/SSE transition penalty (low 128 bits of xmm0-15 are left intact) + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa64 xmm16, xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + 
vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa64 xmm0, xmm16 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + jmp _done + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm15, xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm0, xmm15 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + jmp _done + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, 
xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm14, xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm0, xmm14 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + jmp _done + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm13, xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm0, xmm13 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 
xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm12, xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm0, xmm12 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + jmp _done + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm11, xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm0, xmm11 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, 
xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + jmp _done + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + and N_val, 15 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm0, xmm10 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 
0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_128_dec_vaes +no_XTS_AES_128_dec_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm new file mode 100644 index 000000000..819617283 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm @@ -0,0 +1,1531 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys (AES-128 round keys 0..10; the frame reserves 16*11 bytes for them) + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_avx( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 
16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +; arguments: %1 = raw_key, %2 = scratch xmm register, %3 = round_key; raw_key and the scratch register are both overwritten +; NOTE(review): the first shuffle copies the low dword of the scratch register into the value XORed onto round_key, so it is presumably expected to be zero on entry - confirm at the call site +; the VEX-encoded vshufps is used (not legacy shufps) so the macro does not mix SSE and AVX encodings; the low 128 bits of the result are identical +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + + vmovdqu %%xkey2, [%%ptr_key2] + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa 
[%%ptr_expanded_keys+16*0], %%xkey1 + + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 
6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 
%2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, 
ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, 
ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif 
+%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if 
(%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if 
(%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + 
vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, 
ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + vaesenclast 
%%ST5, %%T0 + vaesenclast %%ST6, %%T0 + vaesenclast %%ST7, %%T0 + vaesenclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_avx, function +XTS_AES_128_enc_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are handled at _last_eight and will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) +
je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu 
[ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, 
xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, 
target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + 
mov [TW + 8], twtemph + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is the blend mask, passed as an explicit operand in this VEX form + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je
_num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 
16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + 
+vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm new file mode 100644 index 000000000..f0f5f02f5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm @@ -0,0 +1,1506 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; k1 round keys (second argument) are copied to the stack, aligned to 16 Bytes +; k2 round keys (first argument) are required only once, for the tweak, so they are not stored + +%include "reg_sizes.asm" +default rel + +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 so that rsp is 16-byte aligned for the vmovdqa stack accesses +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 so that rsp is 16-byte aligned for the vmovdqa stack accesses +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_expanded_key_avx( +; UINT8 *k2, // key used for tweaking, 16*11 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11
+%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + 
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if 
(%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define 
%%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc 
%%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, 
ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, 
ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 
+%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + 
mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, 
[keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + vaesenclast %%ST5, %%T0 + vaesenclast %%ST6, %%T0 + vaesenclast %%ST7, %%T0 + vaesenclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; the new tweak 8 is written back only now, after the xor above: callers pass [TW+16*7] as %%TW8, so that slot must keep the current tweak until this point + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_expanded_key_avx, function +XTS_AES_128_enc_expanded_key_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + 
vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are handled at _last_eight and are not stitched with Tweak calculations + jl _less_than_128_bytes ; fewer than 8 full blocks: take the short-input path + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 
16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, 
xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc 
twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], 
xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] 
; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher 
+_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 
xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; nothing to store here: the only block is moved to xmm8 and written at _done + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 
0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm new file mode 100644 index 000000000..8ac162c4c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm @@ -0,0 +1,1505 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; the round keys of k1 (data encryption key) are copied to the stack, aligned to 16 Bytes +; the round keys of k2 (tweak key) are required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_expanded_key_sse( +; UINT8 *k2, // key used for tweaking, 16*11 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine 
ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*1] + movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*2] + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*2] + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*3] + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*3] + movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*4] + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*4] 
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*5] + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*5] + movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*6] + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*6] + movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*7] + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*7] + movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*8] + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*8] + movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*9] + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*9] + movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*10] + aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*10] + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define 
%%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, 
[TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + 
aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, 
[keys + 16*5] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, 
%%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + + + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 
16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + 
xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, 
%%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenclast %%ST1, %%T0 + aesenclast %%ST2, %%T0 + aesenclast %%ST3, %%T0 + aesenclast %%ST4, %%T0 + aesenclast %%ST5, %%T0 + aesenclast %%ST6, %%T0 + aesenclast %%ST7, %%T0 + aesenclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_expanded_key_sse, 
function +XTS_AES_128_enc_expanded_key_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx ; rbx is used as twtemph below + +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; passed as encrypt_T's xtmp argument, which the macro never reads (keys arrive already expanded) + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer (5th argument, read from the caller's stack) + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer (6th argument, read from the caller's stack) +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; hold back 128 bytes: the last 8 blocks are encrypted at _last_eight without stitched Tweak calculations + jl _less_than_128_bytes ; fewer than 8 whole blocks in the sector + + add target_ptr_val, ptr_ciphertext ; ciphertext address at which _last_eight takes over + + + mov tmp1, N_val + and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks mod 8) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + 
movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 
16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + 
xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu 
[ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of 
pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesenc xmm8, [keys + 16*1] ; round 1 + aesenc xmm8, [keys + 16*2] ; round 2 + aesenc xmm8, [keys + 16*3] ; round 3 + aesenc xmm8, [keys + 16*4] ; round 4 + aesenc xmm8, [keys + 16*5] ; round 5 + aesenc xmm8, [keys + 16*6] ; round 6 + aesenc xmm8, [keys + 16*7] ; round 7 + aesenc xmm8, [keys + 16*8] ; round 8 + aesenc xmm8, [keys + 16*9] ; round 9 + aesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + 
movdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, 
xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a,
0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm new file mode 100644 index 000000000..730fdcba9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm @@ -0,0 +1,1473 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; expanded keys are not aligned +; the already-expanded round keys of k1 are copied to the stack in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; room for 15 round keys (this AES-128 code stores and uses 11) + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_expanded_key_vaes( +; UINT8 *k2, // expanded key used for tweaking, 16*11 bytes (11 round keys) +; UINT8 *k1, // expanded key used for "ECB" encryption, 16*11 bytes (11 round keys) +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu 
%%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; 
tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 
1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 
+%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == 
%%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, 
ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if 
(%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 9 + 
vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Encrypt 16 blocks in parallel +; generate next 16 tweak values (zmm15..zmm18) when %%last_eight == 0 +%macro encrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4
%%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_expanded_key_vaes, function +XTS_AES_128_enc_expanded_key_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + 
+ + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqa xmm8, xmm0 + vmovdqa xmm0, xmm9 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + + vextracti32x4 xmm8, zmm2, 0x2 + vextracti32x4 xmm0, zmm10, 0x3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + + vextracti32x4 xmm8, zmm2, 0x1 + vextracti32x4 xmm0, zmm10, 0x2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, 
[ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + + vmovdqa xmm8, xmm2 ; xmm8 = last ciphertext block (VEX form, no legacy-SSE mixing) + vextracti32x4 xmm0, zmm10, 0x1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + + vextracti32x4 xmm8, zmm1, 0x3 + vextracti32x4 xmm0, zmm10, 0x0 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_3: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + vmovdqa xmm8, xmm3 + vextracti32x4 xmm0, zmm9, 3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_2: + vextracti32x4 xmm10, zmm9, 1 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + + vmovdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm9, 2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 +
vmovdqa xmm8, xmm1 + vextracti32x4 xmm0, zmm9, 1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + + +_start_by16: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweak values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm4, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0,
[const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + ; load plaintext + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + ; store ciphertext + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm2, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_steal_cipher_next: + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + vmovdqa xmm0, [TW] + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, 
[keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp 
_steal_cipher_next +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, 
xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / 
shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_128_enc_expanded_key_vaes +no_XTS_AES_128_enc_expanded_key_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm new file mode 100644 index 000000000..cbb98cc38 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm @@ -0,0 +1,1530 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key argument (k1) is expanded and stored on the stack, aligned to 16 bytes +; first key argument (k2) is required only once (tweak encryption), no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 11 expanded round keys (AES-128: round keys 0..10) + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*19 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*19 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*29 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_sse( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b 
r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round; args: raw_key, tmp, round_key (raw_key and tmp are overwritten) +; raw_key is the output of the aeskeygenassist instruction +; round_key value before this key_expansion_128 macro is the current round key +; round_key value after this key_expansion_128 macro is the next round key +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 11111111b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + + + +; macro to encrypt the tweak value in parallel with key generation of both keys; key1 round keys are stored at ptr_expanded_keys + 16*n and the encrypted tweak is stored at [TW] + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + + movdqu %%xkey2, [%%ptr_key2] + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + movdqa [%%ptr_expanded_keys+16*1], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + movdqa 
[%%ptr_expanded_keys+16*2], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*3], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*4], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*5], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*6], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*7], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + 
aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*8], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*9], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + movdqa [%%ptr_expanded_keys + 16*10], %%xkey1 + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + 
movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 
+%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 
+%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc 
ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, 
ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + + + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define 
%%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl 
twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + 
aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenclast %%ST1, %%T0 + aesenclast %%ST2, %%T0 + aesenclast %%ST3, %%T0 + aesenclast %%ST4, %%T0 + aesenclast %%ST5, %%T0 + aesenclast %%ST6, %%T0 + aesenclast %%ST7, %%T0 + aesenclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +; XTS_AES_128_enc_sse: XTS encryption with 128-bit AES (SSE code path). +; Outline: reserve VARIABLE_OFFSET bytes of stack and save rbx (plus rdi, rsi and xmm6-xmm15 on win64), +; run encrypt_T on the initial tweak, encrypt the (N_val/16) mod 8 leading blocks, +; then 8 blocks per _main_loop iteration, finish with _last_eight and, +; when N_val is not a multiple of 16, with _steal_cipher. +; Inputs with N_val < 16 jump straight to the epilogue (_ret_) and store no ciphertext. +mk_global XTS_AES_128_enc_sse, function +XTS_AES_128_enc_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 +
movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) will not be stitched with Tweak calculations + jl _less_than_128_bytes ; fewer than 8 whole blocks: handled by the _num_blocks_is_N paths + + add target_ptr_val, ptr_ciphertext ; ciphertext address at which only the final 8 whole blocks remain + + + mov tmp1, N_val + and tmp1, (7 << 4) ; tmp1 = 16 * ((N_val / 16) mod 8): bytes in the leading 0..7 blocks + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 ; the only remaining value, 7 blocks, falls through + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12,
xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu 
[ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl 
twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, 
[ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesenc xmm8, [keys + 16*1] ; round 1 + aesenc xmm8, [keys + 16*2] ; round 2 + aesenc xmm8, [keys + 16*3] ; round 3 + aesenc xmm8, [keys + 16*4] ; round 4 + aesenc xmm8, [keys + 16*5] ; round 5 + aesenc xmm8, [keys + 16*6] ; round 6 + aesenc xmm8, [keys + 16*7] ; round 7 + aesenc xmm8, [keys + 16*8] ; round 8 + 
aesenc xmm8, [keys + 16*9] ; round 9 + aesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu 
[ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, 
xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 ; rebase so the fixed 16*7 (=112) offsets used by _steal_cipher address this input's tail + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 ; rebase so [ptr_ciphertext+16*7] at _done is the slot of the last whole block + movdqa xmm8, xmm2 ; last whole block goes to xmm8, the register _steal_cipher and _done work on + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext (nothing to store here: the only block is written at _done) + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq
0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm new file mode 100644 index 000000000..3532ddda5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm @@ -0,0 +1,1498 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 128-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; the expanded round keys of the data key (k1, the second argument) are stored on the stack, aligned to 16 bytes +; the tweak key (k2, the first argument) is needed only once, so its round keys are not stored + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; room for 15 expanded keys (AES-128 uses 11: rounds 0..10) + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_128_enc_vavx( +; UINT8 *k2, // key used for tweaking, 16*1 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;
arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_128 macro is current round key +; round_key value after this key_expansion_128 macro is next round key +; %1 = raw_key (overwritten: its dword 3 is broadcast to all four dwords), %2 = tmp (overwritten), %3 = round_key (in/out) +; The instructions are written in two-operand form, so the destination is also the first source: +; both vshufps take their low two result dwords from %%xtmp itself. +; The result is the expected running XOR of the round-key dwords only if dword 0 of %%xtmp is zero on entry; +; both shuffles copy that dword back into dword 0, so it is still zero on exit. +; NOTE(review): confirm every caller zeroes the tmp register before the first expansion. +%macro key_expansion_128 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + +; macro to encrypt the tweak value in parallel with key generation of both keys +; %1 xkey2 - working round key of key2 (used only for the tweak encryption) +; %2 xstate_tweak - in: initial tweak; out: encrypted tweak, also stored at [TW] at the end of the macro +; %3 xkey1 - working round key of key1; round keys 0..10 are stored at %8 + 16*round +; %4 xraw_key - scratch for the vaeskeygenassist output +; %5 xtmp - scratch handed to key_expansion_128 +; %6 ptr_key2, %7 ptr_key1 - pointers to the two input keys, 16 bytes read from each with unaligned loads +; %8 ptr_expanded_keys - base of the round-key store, written with aligned stores + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + + vmovdqu %%xkey2, [%%ptr_key2] + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key,
%%xkey1, 0x1 ; Generating round key 1 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + 
vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1 + key_expansion_128 %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 
%11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, 
twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if 
(%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor 
twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 
8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + + + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif 
+%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + 
vaesenc %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Encrypt 16 blocks in parallel +; generate next 16 tweak values (into zmm15-zmm18) when last_eight is 0 +; uses zmm13/zmm14 as scratch for the tweak computation +%macro encrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + ; zmm15 = TW3 shifted left one byte per 128-bit lane, xor (shifted-out byte clmul zpoly) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + ; zmm16 = same transformation applied to TW4 + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_128_enc_vaes, function +XTS_AES_128_enc_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key 
expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqa xmm8, xmm0 + vmovdqa xmm0, xmm9 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + + vextracti32x4 xmm8, zmm2, 0x2 + vextracti32x4 xmm0, zmm10, 0x3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + + vextracti32x4 xmm8, zmm2, 0x1 + vextracti32x4 xmm0, zmm10, 0x2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, 
zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + + ; xmm8 = last full ciphertext block, xmm0 = next tweak (inputs of _steal_cipher) + vmovdqa xmm8, xmm2 ; VEX form: no legacy-SSE instruction while upper ZMM state is dirty + vextracti32x4 xmm0, zmm10, 0x1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + + vextracti32x4 xmm8, zmm1, 0x3 + vextracti32x4 xmm0, zmm10, 0x0 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_3: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + vmovdqa xmm8, xmm3 + vextracti32x4 xmm0, zmm9, 3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_2: + vextracti32x4 xmm10, zmm9, 1 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + + vmovdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm9, 2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + vmovdqa xmm8, xmm1 + vextracti32x4 xmm0, zmm9, 1 + and N_val, 15 + je _ret_ + jmp
_steal_cipher + + +_start_by16: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweak values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm4, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + 
vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + ; load plaintext + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + ; store ciphertext + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm2, 0x3 ; keep last crypted block + jmp _do_n_blocks + +_steal_cipher_next: + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + vmovdqa xmm0, [TW] + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 
16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenclast xmm8, [keys + 16*10] ; round 10 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7*16) + cmp tmp1, (6*16) + je _num_blocks_is_6 + cmp tmp1, (5*16) + je _num_blocks_is_5 + cmp tmp1, (4*16) + je _num_blocks_is_4 + cmp tmp1, (3*16) + je _num_blocks_is_3 + cmp tmp1, (2*16) + je _num_blocks_is_2 + cmp tmp1, (1*16) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, 
xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, 
xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89,
0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. +%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_128_enc_vaes +no_XTS_AES_128_enc_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm new file mode 100644 index 000000000..776525bdd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm @@ -0,0 +1,1962 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_avx( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b 
r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_256_flip/flop macro is current round key +; round_key value after this key_expansion_256_flip/flop macro is next round key +; 2 macros will be used for key generation in a flip-flopped fashion +; args: %1 = raw_key (overwritten by the vpshufd broadcast), %2 = tmp, %3 = round_key (updated in place) +; NOTE(review): vshufps reads the previous contents of tmp; presumably the caller zeroes it beforehand - confirm +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +; identical to key_expansion_256_flip except that vpshufd broadcasts dword 2 of raw_key instead of dword 3 +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 10101010b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 11 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define %%xraw_key %6 +%define %%xtmp %7 +%define %%xtmp2 %8 +%define %%ptr_key2 %9 +%define %%ptr_key1 %10 +%define %%ptr_expanded_keys %11 + + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1] + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+ + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vaesimc 
%%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc 
%%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro


;-----------------------------------------------------------------------
; initialize ST1..ST8, TW1..TW7, num_initial_blocks
;
; Generates the initial tweak values and loads the initial input blocks.
;
;   %1..%8    ST1..ST8  xmm state registers. ST1..ST<n> receive the first
;                       n blocks read from ptr_plaintext. ST8 is accepted
;                       for call-site symmetry but is never touched here.
;   %9..%15   TW1..TW7  xmm tweak registers. TW1..TW<n> are loaded.
;   %16       n         number of blocks to set up (1..7), evaluated at
;                       assembly time by the %if chain below.
;
; Reads : [TW+16*0] (current tweak), [ptr_plaintext+16*0 .. 16*n)
; Writes: [TW+16*1 .. 16*n) - each derived tweak is spilled to the TW
;         scratch area and then reloaded into its xmm register.
; Clobb : twtempl, twtemph, ghash_poly_8b_temp, flags.
; Exit  : twtemph:twtempl holds tweak n (the last one generated).
;
; Tweak step used throughout: the 128-bit value twtemph:twtempl is
; shifted left by one bit (shl + adc); when a bit is carried out of
; twtemph, ghash_poly_8b is XORed into twtempl (cmovc + xor).
; TW is accessed with vmovdqa, so it must be 16-byte aligned.
; ptr_plaintext is accessed with vmovdqu and may be unaligned.
;-----------------------------------------------------------------------
%macro initialize 16

%define %%ST1	%1	; state 1
%define %%ST2	%2	; state 2
%define %%ST3	%3	; state 3
%define %%ST4	%4	; state 4
%define %%ST5	%5	; state 5
%define %%ST6	%6	; state 6
%define %%ST7	%7	; state 7
%define %%ST8	%8	; state 8 (unused in this macro)

%define %%TW1	%9	; tweak 1
%define %%TW2	%10	; tweak 2
%define %%TW3	%11	; tweak 3
%define %%TW4	%12	; tweak 4
%define %%TW5	%13	; tweak 5
%define %%TW6	%14	; tweak 6
%define %%TW7	%15	; tweak 7

%define %%num_initial_blocks	%16

	; tweak 1 is the value already stored at [TW]; keep a scalar copy
	; in twtemph:twtempl as the seed for the following tweak steps
	vmovdqa	%%TW1, [TW+16*0]
	mov	twtempl, [TW+8*0]
	mov	twtemph, [TW+8*1]
	vmovdqu	%%ST1, [ptr_plaintext+16*0]
%if (%%num_initial_blocks>=2)
	; tweak 2
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph
	vmovdqa	%%TW2, [TW+16*1]
	vmovdqu	%%ST2, [ptr_plaintext+16*1]
%endif
%if (%%num_initial_blocks>=3)
	; tweak 3
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*4], twtempl
	mov	[TW+8*5], twtemph
	vmovdqa	%%TW3, [TW+16*2]
	vmovdqu	%%ST3, [ptr_plaintext+16*2]
%endif
%if (%%num_initial_blocks>=4)
	; tweak 4
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*6], twtempl
	mov	[TW+8*7], twtemph
	vmovdqa	%%TW4, [TW+16*3]
	vmovdqu	%%ST4, [ptr_plaintext+16*3]
%endif
%if (%%num_initial_blocks>=5)
	; tweak 5
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*8], twtempl
	mov	[TW+8*9], twtemph
	vmovdqa	%%TW5, [TW+16*4]
	vmovdqu	%%ST5, [ptr_plaintext+16*4]
%endif
%if (%%num_initial_blocks>=6)
	; tweak 6
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*10], twtempl
	mov	[TW+8*11], twtemph
	vmovdqa	%%TW6, [TW+16*5]
	vmovdqu	%%ST6, [ptr_plaintext+16*5]
%endif
%if (%%num_initial_blocks>=7)
	; tweak 7
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*12], twtempl
	mov	[TW+8*13], twtemph
	vmovdqa	%%TW7, [TW+16*6]
	vmovdqu	%%ST7, [ptr_plaintext+16*6]
%endif

%endmacro


;-----------------------------------------------------------------------
; encrypt_initial ST1..ST8, TW1..TW7, T0, num_blocks, lt128
;
; Processes the initial 1..7 blocks: XOR with tweak, AddRoundKey with
; [keys], rounds 1..13 with [keys+16*r], final round 14, XOR with tweak.
; NOTE: despite the "encrypt" name, the round instructions in this file
; are vaesdec / vaesdeclast.
;
;   %1..%8    ST1..ST8  xmm state registers; only ST1..ST<n> are used
;                       (ST8 is never touched by this macro).
;   %9..%15   TW1..TW7  xmm tweak registers for ST1..ST7.
;   %16       T0        xmm scratch register holding the round key.
;   %17       n         number of blocks (1..7), assembly-time constant.
;   %18       lt128     0: also generate the next eight tweak values
;                          into [TW+16*0 .. 16*7] while the AES rounds
;                          run, then reload TW1..TW7 from [TW+16*0..6]
;                          (tweak 8 stays in memory at [TW+16*7]).
;                       non-zero: no tweak generation, TW1..TW7 are
;                          left as they were.
;
; Clobb : T0; when lt128 == 0 also twtempl, twtemph,
;         ghash_poly_8b_temp, flags, [TW+16*0 .. 16*7], TW1..TW7.
;
; WARNING on ordering: the scalar tweak arithmetic is split across the
; AES rounds, and the carry flag written by shl/adc in one fragment is
; consumed by adc/cmovc in a later fragment (e.g. shl after round 4,
; adc after round 5). Do not insert any flag-writing instruction
; between those fragments or reorder them.
;-----------------------------------------------------------------------
%macro encrypt_initial 18
%define %%ST1	%1	; state 1
%define %%ST2	%2	; state 2
%define %%ST3	%3	; state 3
%define %%ST4	%4	; state 4
%define %%ST5	%5	; state 5
%define %%ST6	%6	; state 6
%define %%ST7	%7	; state 7
%define %%ST8	%8	; state 8 (unused in this macro)

%define %%TW1	%9	; tweak 1
%define %%TW2	%10	; tweak 2
%define %%TW3	%11	; tweak 3
%define %%TW4	%12	; tweak 4
%define %%TW5	%13	; tweak 5
%define %%TW6	%14	; tweak 6
%define %%TW7	%15	; tweak 7
%define %%T0	%16	; Temp register
%define %%num_blocks	%17
; %%num_blocks blocks are processed
; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7

%define %%lt128	%18	; less than 128 bytes

	; xor Tweak value
	vpxor	%%ST1, %%TW1
%if (%%num_blocks>=2)
	vpxor	%%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
	vpxor	%%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
	vpxor	%%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
	vpxor	%%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
	vpxor	%%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
	vpxor	%%ST7, %%TW7
%endif


	; ARK
	vmovdqa	%%T0, [keys]
	vpxor	%%ST1, %%T0
%if (%%num_blocks>=2)
	vpxor	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vpxor	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vpxor	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vpxor	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vpxor	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vpxor	%%ST7, %%T0
%endif


%if (0 == %%lt128)
	; next Tweak1: shift started here, carry consumed after round 1
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
%endif

	; round 1
	vmovdqa	%%T0, [keys + 16*1]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif
%if (0 == %%lt128)
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*0], twtempl	; next Tweak1 generated
	mov	[TW + 8*1], twtemph
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
%endif

	; round 2
	vmovdqa	%%T0, [keys + 16*2]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

%if (0 == %%lt128)
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*2], twtempl	; next Tweak2 generated
%endif

	; round 3
	vmovdqa	%%T0, [keys + 16*3]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif
%if (0 == %%lt128)
	mov	[TW + 8*3], twtemph	; high half of Tweak2
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
%endif

	; round 4
	vmovdqa	%%T0, [keys + 16*4]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

%if (0 == %%lt128)
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*4], twtempl	; next Tweak3 generated
	mov	[TW + 8*5], twtemph
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1		; carry is consumed after round 5
%endif

	; round 5
	vmovdqa	%%T0, [keys + 16*5]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

%if (0 == %%lt128)
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*6], twtempl	; next Tweak4 generated
	mov	[TW + 8*7], twtemph
%endif

	; round 6
	vmovdqa	%%T0, [keys + 16*6]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

%if (0 == %%lt128)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*8], twtempl	; next Tweak5 generated
	mov	[TW + 8*9], twtemph
%endif

	; round 7
	vmovdqa	%%T0, [keys + 16*7]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

%if (0 == %%lt128)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*10], twtempl	; next Tweak6 generated
	mov	[TW + 8*11], twtemph
%endif
	; round 8
	vmovdqa	%%T0, [keys + 16*8]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

%if (0 == %%lt128)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*12], twtempl	; next Tweak7 generated
	mov	[TW + 8*13], twtemph
%endif
	; round 9
	vmovdqa	%%T0, [keys + 16*9]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

%if (0 == %%lt128)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*14], twtempl	; next Tweak8 generated
	mov	[TW + 8*15], twtemph
%endif
	; round 10
	vmovdqa	%%T0, [keys + 16*10]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif
	; round 11
	vmovdqa	%%T0, [keys + 16*11]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

	; round 12
	vmovdqa	%%T0, [keys + 16*12]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

	; round 13
	vmovdqa	%%T0, [keys + 16*13]
	vaesdec	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdec	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdec	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdec	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdec	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdec	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdec	%%ST7, %%T0
%endif

	; round 14
	vmovdqa	%%T0, [keys + 16*14]
	vaesdeclast	%%ST1, %%T0
%if (%%num_blocks>=2)
	vaesdeclast	%%ST2, %%T0
%endif
%if (%%num_blocks>=3)
	vaesdeclast	%%ST3, %%T0
%endif
%if (%%num_blocks>=4)
	vaesdeclast	%%ST4, %%T0
%endif
%if (%%num_blocks>=5)
	vaesdeclast	%%ST5, %%T0
%endif
%if (%%num_blocks>=6)
	vaesdeclast	%%ST6, %%T0
%endif
%if (%%num_blocks>=7)
	vaesdeclast	%%ST7, %%T0
%endif

	; xor Tweak values
	vpxor	%%ST1, %%TW1
%if (%%num_blocks>=2)
	vpxor	%%ST2, %%TW2
%endif
%if (%%num_blocks>=3)
	vpxor	%%ST3, %%TW3
%endif
%if (%%num_blocks>=4)
	vpxor	%%ST4, %%TW4
%endif
%if (%%num_blocks>=5)
	vpxor	%%ST5, %%TW5
%endif
%if (%%num_blocks>=6)
	vpxor	%%ST6, %%TW6
%endif
%if (%%num_blocks>=7)
	vpxor	%%ST7, %%TW7
%endif


%if (0 == %%lt128)
	; load next Tweak values (tweak 8 stays in memory at [TW + 16*7])
	vmovdqa	%%TW1, [TW + 16*0]
	vmovdqa	%%TW2, [TW + 16*1]
	vmovdqa	%%TW3, [TW + 16*2]
	vmovdqa	%%TW4, [TW + 16*3]
	vmovdqa	%%TW5, [TW + 16*4]
	vmovdqa	%%TW6, [TW + 16*5]
	vmovdqa	%%TW7, [TW + 16*6]

%endif

%endmacro


;-----------------------------------------------------------------------
; encrypt_by_eight ST1..ST8, TW1..TW8, T0, last_eight
;
; Processes 8 blocks in parallel: XOR with tweak, AddRoundKey with
; [keys], rounds 1..13 with [keys+16*r], final round 14, XOR with tweak.
; NOTE: despite the "encrypt" name, the round instructions in this file
; are vaesdec / vaesdeclast.
;
;   %1..%8    ST1..ST8  xmm state registers (in: input blocks,
;                       out: processed blocks).
;   %9..%16   TW1..TW8  tweak operands for ST1..ST8. TW8 is only used
;                       as a vpxor source; the call sites in this file
;                       pass the memory operand [TW+16*7] for it.
;   %17       T0        xmm scratch register holding the round key.
;   %18       last_eight
;                       0: generate the next eight tweak values from
;                          twtemph:twtempl while the AES rounds run.
;                       non-zero: skip tweak generation.
;
; Regardless of last_eight, after the final tweak XOR the macro stores
; twtemph:twtempl to [TW + 16*7] and reloads TW1..TW7 from
; [TW + 16*0 .. 16*6].
;
; Clobb : T0, TW1..TW7, [TW + 16*7]; when last_eight == 0 also twtempl,
;         twtemph, ghash_poly_8b_temp, flags, [TW + 16*0 .. 16*6].
;
; WARNING on ordering: the scalar tweak arithmetic is split across the
; AES rounds, and the carry flag written by shl/adc in one fragment is
; consumed by adc/cmovc in a later fragment. Do not insert any
; flag-writing instruction between those fragments or reorder them.
;-----------------------------------------------------------------------
%macro encrypt_by_eight 18
%define %%ST1	%1	; state 1
%define %%ST2	%2	; state 2
%define %%ST3	%3	; state 3
%define %%ST4	%4	; state 4
%define %%ST5	%5	; state 5
%define %%ST6	%6	; state 6
%define %%ST7	%7	; state 7
%define %%ST8	%8	; state 8
%define %%TW1	%9	; tweak 1
%define %%TW2	%10	; tweak 2
%define %%TW3	%11	; tweak 3
%define %%TW4	%12	; tweak 4
%define %%TW5	%13	; tweak 5
%define %%TW6	%14	; tweak 6
%define %%TW7	%15	; tweak 7
%define %%TW8	%16	; tweak 8
%define %%T0	%17	; Temp register
%define %%last_eight	%18

	; xor Tweak values
	vpxor	%%ST1, %%TW1
	vpxor	%%ST2, %%TW2
	vpxor	%%ST3, %%TW3
	vpxor	%%ST4, %%TW4
	vpxor	%%ST5, %%TW5
	vpxor	%%ST6, %%TW6
	vpxor	%%ST7, %%TW7
	vpxor	%%ST8, %%TW8

	; ARK
	vmovdqa	%%T0, [keys]
	vpxor	%%ST1, %%T0
	vpxor	%%ST2, %%T0
	vpxor	%%ST3, %%T0
	vpxor	%%ST4, %%T0
	vpxor	%%ST5, %%T0
	vpxor	%%ST6, %%T0
	vpxor	%%ST7, %%T0
	vpxor	%%ST8, %%T0

%if (0 == %%last_eight)
	; next tweak 1 (stored to [TW + 16*0] after round 1)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
%endif
	; round 1
	vmovdqa	%%T0, [keys + 16*1]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*0], twtempl
	mov	[TW + 8*1], twtemph
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
%endif
	; round 2
	vmovdqa	%%T0, [keys + 16*2]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	; next tweak 2 (stored to [TW + 16*1] after round 3)
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp

%endif
	; round 3
	vmovdqa	%%T0, [keys + 16*3]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	mov	[TW + 8*2], twtempl
	mov	[TW + 8*3], twtemph
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1		; carry is consumed after round 4
%endif
	; round 4
	vmovdqa	%%T0, [keys + 16*4]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	; next tweak 3 -> [TW + 16*2]
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*4], twtempl
%endif
	; round 5
	vmovdqa	%%T0, [keys + 16*5]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	mov	[TW + 8*5], twtemph
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph	; carry is consumed after round 6
%endif
	; round 6
	vmovdqa	%%T0, [keys + 16*6]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	; next tweak 4 -> [TW + 16*3]
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*6], twtempl
	mov	[TW + 8*7], twtemph
%endif
	; round 7
	vmovdqa	%%T0, [keys + 16*7]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	; next tweak 5 (stored to [TW + 16*4] after round 8)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
%endif
	; round 8
	vmovdqa	%%T0, [keys + 16*8]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*8], twtempl
	mov	[TW + 8*9], twtemph
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
%endif
	; round 9
	vmovdqa	%%T0, [keys + 16*9]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	; next tweak 6 (stored to [TW + 16*5] after round 10)
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
%endif
	; round 10
	vmovdqa	%%T0, [keys + 16*10]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	mov	[TW + 8*10], twtempl
	mov	[TW + 8*11], twtemph
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1		; carry is consumed after round 11
%endif
	; round 11
	vmovdqa	%%T0, [keys + 16*11]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	; next tweak 7 -> [TW + 16*6]
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW + 8*12], twtempl
%endif
	; round 12
	vmovdqa	%%T0, [keys + 16*12]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	mov	[TW + 8*13], twtemph
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph	; carry is consumed after round 13
%endif
	; round 13
	vmovdqa	%%T0, [keys + 16*13]
	vaesdec	%%ST1, %%T0
	vaesdec	%%ST2, %%T0
	vaesdec	%%ST3, %%T0
	vaesdec	%%ST4, %%T0
	vaesdec	%%ST5, %%T0
	vaesdec	%%ST6, %%T0
	vaesdec	%%ST7, %%T0
	vaesdec	%%ST8, %%T0
%if (0 == %%last_eight)
	; next tweak 8 is complete in twtemph:twtempl here, but it is NOT
	; stored yet: the call sites pass [TW + 16*7] as %%TW8, and that
	; slot is still read by the final tweak XOR below.
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
%endif
	; round 14
	vmovdqa	%%T0, [keys + 16*14]
	vaesdeclast	%%ST1, %%T0
	vaesdeclast	%%ST2, %%T0
	vaesdeclast	%%ST3, %%T0
	vaesdeclast	%%ST4, %%T0
	vaesdeclast	%%ST5, %%T0
	vaesdeclast	%%ST6, %%T0
	vaesdeclast	%%ST7, %%T0
	vaesdeclast	%%ST8, %%T0

	; xor Tweak values
	vpxor	%%ST1, %%TW1
	vpxor	%%ST2, %%TW2
	vpxor	%%ST3, %%TW3
	vpxor	%%ST4, %%TW4
	vpxor	%%ST5, %%TW5
	vpxor	%%ST6, %%TW6
	vpxor	%%ST7, %%TW7
	vpxor	%%ST8, %%TW8

	; deferred store of twtemph:twtempl into the tweak-8 slot
	; (executed for every value of last_eight)
	mov	[TW + 8*14], twtempl
	mov	[TW + 8*15], twtemph
	; load next Tweak values
	vmovdqa	%%TW1, [TW + 16*0]
	vmovdqa	%%TW2, [TW + 16*1]
	vmovdqa	%%TW3, [TW + 16*2]
	vmovdqa	%%TW4, [TW + 16*3]
	vmovdqa	%%TW5, [TW + 16*4]
	vmovdqa	%%TW6, [TW + 16*5]
	vmovdqa	%%TW7, [TW + 16*6]

%endmacro
+ + +section .text + +mk_global XTS_AES_256_dec_avx, function +XTS_AES_256_dec_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) 
+ je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu 
[ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, 
xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, 
target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + vmovdqa xmm1, [TW + 16*7] + vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + 
vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec 
xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdec xmm8, [keys + 16*10] ; round 9 + vaesdec xmm8, [keys + 16*11] ; round 9 + vaesdec xmm8, [keys + 16*12] ; round 9 + vaesdec xmm8, [keys + 16*13] ; round 9 + vaesdeclast xmm8, [keys + 16*14] ; round 10 + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm15 + vmovdqa xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], 
xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm14 + vmovdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + jmp 
_done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm13 + vmovdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm12 + vmovdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + 
vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm11 + vmovdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + 
vmovdqa [TW + 16*0] , xmm10 + vmovdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + vmovdqa [TW + 16*0] , xmm9 + vmovdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 
0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm new file mode 100644 index 000000000..d52d0977e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm @@ -0,0 +1,1896 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_expanded_key_avx( +; 
UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" decryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +; note: in this decrypt routine ptr_plaintext is the 5th argument (ct, the data that is read) +; and ptr_ciphertext is the 6th argument (pt, the data that is written) +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +; (tmp1 and target_ptr_val are defined on the same registers as ptr_key2 and ptr_key1) +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value +; it also copies the 15 round keys found at %%ptr_key1 to %%ptr_expanded_keys +; and stores the encrypted tweak at [TW] + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 ; not referenced in the macro body +%define %%xtmp %5 ; not referenced in the macro body +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*14] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*13] + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + 
vmovdqu %%xkey1, [%%ptr_key1 + 16*12] + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*11] + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*11] + vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak 
encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*12] + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*13] + vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*14] + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*0] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc 
twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; decrypt initial blocks of AES (the rounds are vaesdec/vaesdeclast; the macro is named encrypt_initial) +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values are generated only when the last macro argument (%18) is 0 +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 
; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + 
vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW 
+ 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + 
vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec 
%%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Decrypt 8 blocks in parallel (the rounds are vaesdec/vaesdeclast; the macro is named encrypt_by_eight) +; generate next 8 tweak values (the tweak arithmetic is assembled only when %%last_eight is 0) +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 ; 0: interleave the next-tweak arithmetic with the AES rounds + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor 
%%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec 
%%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + 
mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + vaesdec %%ST5, %%T0 + vaesdec %%ST6, %%T0 + vaesdec %%ST7, %%T0 + vaesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + vaesdeclast %%ST5, %%T0 + vaesdeclast %%ST6, %%T0 + vaesdeclast %%ST7, %%T0 + vaesdeclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + 
section .text

;-----------------------------------------------------------------------
; XTS_AES_256_dec_expanded_key_avx
;
; XTS-AES-256 decryption of one data unit with caller-supplied expanded
; round keys, using VEX-encoded AES-NI (vaesdec / vaesdeclast).
;
; NOTE(review): the prototype comment and the register aliases (ptr_key2,
; ptr_key1, T_val, N_val, ptr_plaintext, ptr_ciphertext, tmp1, ...) are
; defined earlier in this file; presumably the same six-argument form as
; the _sse variant (k2, k1, TW_initial, N, ct, pt) -- confirm.
;
; Naming: data is READ through ptr_plaintext and WRITTEN through
; ptr_ciphertext, and the helper macros are called encrypt_*/initialize,
; even though every data round here is a decrypt round.
;
; Flow:
;   1. encrypt_T is invoked with the initial tweak in xmm1; the code below
;      then reads the tweak from [TW] and the round keys from [keys]
;      (presumably both are written by encrypt_T -- confirm).
;   2. N_val < 16: return without writing any output.
;   3. (N_val & -16) < 128: _less_than_128_bytes handles 1..7 whole blocks.
;   4. Otherwise the (N_val >> 4) & 7 leading blocks are processed by
;      initialize/encrypt_initial, _main_loop processes 8 blocks per
;      iteration until ptr_ciphertext reaches target_ptr_val, and the final
;      8 blocks are processed at _last_eight / _done_final.
;   5. If N_val mod 16 != 0 the tail is handled by cipher stealing: the
;      last whole block uses the NEXT tweak and the stolen block uses the
;      previous one (the two tweaks are swapped relative to encryption).
;
; Tweak update (repeated throughout): twtemph:twtempl is shifted left by
; one bit (shl + adc); if a bit is carried out of twtemph, ghash_poly_8b
; (loaded with GHASH_POLY below) is xor-ed into twtempl.
;
; Saved/restored: rbx always; rdi, rsi and xmm6-xmm15 on win64.
; Stack: VARIABLE_OFFSET bytes are reserved for TW, keys and save areas.
;-----------------------------------------------------------------------
mk_global XTS_AES_256_dec_expanded_key_avx, function
XTS_AES_256_dec_expanded_key_avx:
	endbranch

	sub	rsp, VARIABLE_OFFSET

	; rbx backs twtemph in this code base and is callee-saved
	mov	[_gpr + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
	; rdi, rsi and xmm6-xmm15 are callee-saved in the Microsoft x64 ABI
	mov	[_gpr + 8*1], rdi
	mov	[_gpr + 8*2], rsi

	vmovdqa	[_xmm + 16*0], xmm6
	vmovdqa	[_xmm + 16*1], xmm7
	vmovdqa	[_xmm + 16*2], xmm8
	vmovdqa	[_xmm + 16*3], xmm9
	vmovdqa	[_xmm + 16*4], xmm10
	vmovdqa	[_xmm + 16*5], xmm11
	vmovdqa	[_xmm + 16*6], xmm12
	vmovdqa	[_xmm + 16*7], xmm13
	vmovdqa	[_xmm + 16*8], xmm14
	vmovdqa	[_xmm + 16*9], xmm15
%endif

	mov	ghash_poly_8b, GHASH_POLY	; load 0x87 to ghash_poly_8b


	vmovdqu	xmm1, [T_val]			; read initial Tweak value
	vpxor	xmm4, xmm4			; zero xmm4; passed to encrypt_T as its 5th argument
	encrypt_T	xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys


%ifidn __OUTPUT_FORMAT__, win64
	; 5th and 6th arguments are passed on the stack on win64
	mov	ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5]	; input data pointer
	mov	ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6]	; output data pointer
%endif



	mov	target_ptr_val, N_val
	and	target_ptr_val, -16		; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
	sub	target_ptr_val, 128		; the last 8 blocks (128 bytes) are processed without stitched Tweak calculations
	; NOTE(review): jl is a signed test; it matches an unsigned test for any N_val below 2^63
	jl	_less_than_128_bytes

	add	target_ptr_val, ptr_ciphertext	; output address at which the final 8 blocks start


	; tmp1 = 16 * (number of whole blocks mod 8) = bytes handled before the 8-block loop
	mov	tmp1, N_val
	and	tmp1, (7 << 4)
	jz	_initial_num_blocks_is_0

	cmp	tmp1, (4 << 4)
	je	_initial_num_blocks_is_4



	cmp	tmp1, (6 << 4)
	je	_initial_num_blocks_is_6

	cmp	tmp1, (5 << 4)
	je	_initial_num_blocks_is_5



	cmp	tmp1, (3 << 4)
	je	_initial_num_blocks_is_3

	cmp	tmp1, (2 << 4)
	je	_initial_num_blocks_is_2

	cmp	tmp1, (1 << 4)
	je	_initial_num_blocks_is_1

	; none of the above matched: 7 leading blocks
_initial_num_blocks_is_7:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
	add	ptr_plaintext, 16*7
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5
	vmovdqu	[ptr_ciphertext+16*5], xmm6
	vmovdqu	[ptr_ciphertext+16*6], xmm7
	add	ptr_ciphertext, 16*7

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop
_initial_num_blocks_is_6:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
	add	ptr_plaintext, 16*6
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5
	vmovdqu	[ptr_ciphertext+16*5], xmm6
	add	ptr_ciphertext, 16*6

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop
_initial_num_blocks_is_5:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
	add	ptr_plaintext, 16*5
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5
	add	ptr_ciphertext, 16*5

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop
_initial_num_blocks_is_4:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
	add	ptr_plaintext, 16*4
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	add	ptr_ciphertext, 16*4

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop


_initial_num_blocks_is_3:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
	add	ptr_plaintext, 16*3
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	add	ptr_ciphertext, 16*3

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop
_initial_num_blocks_is_2:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
	add	ptr_plaintext, 16*2
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
	; store output blocks
	vmovdqu	[ptr_ciphertext], xmm1
	vmovdqu	[ptr_ciphertext+16], xmm2
	add	ptr_ciphertext, 16*2

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop

_initial_num_blocks_is_1:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
	add	ptr_plaintext, 16*1
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
	; store output block
	vmovdqu	[ptr_ciphertext], xmm1
	add	ptr_ciphertext, 16

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight

	jmp	_main_loop

	; No leading blocks: build tweaks 1..8 from the tweak stored at [TW].
	; Tweaks 1..7 are kept in xmm9..xmm15; the 8th stays only in [TW+16*7].
_initial_num_blocks_is_0:
	mov	twtempl, [TW+8*0]
	mov	twtemph, [TW+8*1]
	vmovdqa	xmm9, [TW+16*0]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph
	vmovdqa	xmm10, [TW+16*1]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*4], twtempl
	mov	[TW+8*5], twtemph
	vmovdqa	xmm11, [TW+16*2]


	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*6], twtempl
	mov	[TW+8*7], twtemph
	vmovdqa	xmm12, [TW+16*3]


	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*8], twtempl
	mov	[TW+8*9], twtemph
	vmovdqa	xmm13, [TW+16*4]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*10], twtempl
	mov	[TW+8*11], twtemph
	vmovdqa	xmm14, [TW+16*5]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*12], twtempl
	mov	[TW+8*13], twtemph
	vmovdqa	xmm15, [TW+16*6]

	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*14], twtempl
	mov	[TW+8*15], twtemph
	;vmovdqa	xmm16, [TW+16*7]	; no xmm16 available here: the 8th tweak is passed to encrypt_by_eight as a memory operand

	cmp	ptr_ciphertext, target_ptr_val
	je	_last_eight
_main_loop:
	; load 8 input blocks
	vmovdqu	xmm1, [ptr_plaintext+16*0]
	vmovdqu	xmm2, [ptr_plaintext+16*1]
	vmovdqu	xmm3, [ptr_plaintext+16*2]
	vmovdqu	xmm4, [ptr_plaintext+16*3]
	vmovdqu	xmm5, [ptr_plaintext+16*4]
	vmovdqu	xmm6, [ptr_plaintext+16*5]
	vmovdqu	xmm7, [ptr_plaintext+16*6]
	vmovdqu	xmm8, [ptr_plaintext+16*7]

	add	ptr_plaintext, 128

	; last argument 0: the macro also generates the next 8 tweaks between AES rounds
	encrypt_by_eight	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0

	; store 8 output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5
	vmovdqu	[ptr_ciphertext+16*5], xmm6
	vmovdqu	[ptr_ciphertext+16*6], xmm7
	vmovdqu	[ptr_ciphertext+16*7], xmm8
	add	ptr_ciphertext, 128

	cmp	ptr_ciphertext, target_ptr_val
	jne	_main_loop

_last_eight:

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_final		; no partial block: plain final 8 blocks

	; generate next Tweak value
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	vmovdqa	xmm1, [TW + 16*7]
	vmovdqa	[TW + 16*0], xmm1	; swap tweak values for cipher stealing for decrypt

	; block 8 of this group uses the newly generated tweak;
	; the tweak saved at [TW] is used later in _steal_cipher
	mov	[TW + 16*7], twtempl
	mov	[TW + 16*7+8], twtemph

	; load 8 input blocks
	vmovdqu	xmm1, [ptr_plaintext+16*0]
	vmovdqu	xmm2, [ptr_plaintext+16*1]
	vmovdqu	xmm3, [ptr_plaintext+16*2]
	vmovdqu	xmm4, [ptr_plaintext+16*3]
	vmovdqu	xmm5, [ptr_plaintext+16*4]
	vmovdqu	xmm6, [ptr_plaintext+16*5]
	vmovdqu	xmm7, [ptr_plaintext+16*6]
	vmovdqu	xmm8, [ptr_plaintext+16*7]
	encrypt_by_eight	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

	; store the first 7 output blocks; xmm8 is finished in _steal_cipher
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5
	vmovdqu	[ptr_ciphertext+16*5], xmm6
	vmovdqu	[ptr_ciphertext+16*6], xmm7
	jmp	_steal_cipher


_done_final:
	; load 8 input blocks
	vmovdqu	xmm1, [ptr_plaintext+16*0]
	vmovdqu	xmm2, [ptr_plaintext+16*1]
	vmovdqu	xmm3, [ptr_plaintext+16*2]
	vmovdqu	xmm4, [ptr_plaintext+16*3]
	vmovdqu	xmm5, [ptr_plaintext+16*4]
	vmovdqu	xmm6, [ptr_plaintext+16*5]
	vmovdqu	xmm7, [ptr_plaintext+16*6]
	vmovdqu	xmm8, [ptr_plaintext+16*7]
	encrypt_by_eight	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1

	; store the first 7 output blocks; xmm8 is stored at _done
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5
	vmovdqu	[ptr_ciphertext+16*5], xmm6
	vmovdqu	[ptr_ciphertext+16*6], xmm7

	jmp	_done


	; Entry conditions: xmm8 = processed last whole block, N_val = 1..15,
	; [TW] = tweak for the stolen block, and ptr_plaintext/ptr_ciphertext
	; positioned so that the last whole block is at offset 112.
_steal_cipher:
	; start cipher stealing


	vmovdqa	xmm2, xmm8

	; shift xmm8 to the left by 16-N_val bytes
	lea	twtempl, [vpshufb_shf_table]
	vmovdqu	xmm0, [twtempl+N_val]
	vpshufb	xmm8, xmm0


	vmovdqu	xmm3, [ptr_plaintext + 112 + N_val]	; state register is temporarily xmm3 to eliminate a move
	; 16-byte store ending at the end of the partial output block; the part
	; that overlaps offset 112..127 is overwritten by the store at _done
	vmovdqu	[ptr_ciphertext + 112 + N_val], xmm8

	; shift xmm3 to the right by 16-N_val bytes
	lea	twtempl, [vpshufb_shf_table +16]
	sub	twtempl, N_val
	vmovdqu	xmm0, [twtempl]
	vpxor	xmm0, [mask1]
	vpshufb	xmm3, xmm0

	vpblendvb	xmm3, xmm3, xmm2, xmm0	; xmm0 is the blend mask (explicit operand in the VEX form)

	; xor Tweak value
	vmovdqa	xmm8, [TW]
	vpxor	xmm8, xmm3	; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped


	; decrypt last block with cipher stealing
	vpxor	xmm8, [keys]			; ARK
	vaesdec	xmm8, [keys + 16*1]		; round 1
	vaesdec	xmm8, [keys + 16*2]		; round 2
	vaesdec	xmm8, [keys + 16*3]		; round 3
	vaesdec	xmm8, [keys + 16*4]		; round 4
	vaesdec	xmm8, [keys + 16*5]		; round 5
	vaesdec	xmm8, [keys + 16*6]		; round 6
	vaesdec	xmm8, [keys + 16*7]		; round 7
	vaesdec	xmm8, [keys + 16*8]		; round 8
	vaesdec	xmm8, [keys + 16*9]		; round 9
	vaesdec	xmm8, [keys + 16*10]		; round 10
	vaesdec	xmm8, [keys + 16*11]		; round 11
	vaesdec	xmm8, [keys + 16*12]		; round 12
	vaesdec	xmm8, [keys + 16*13]		; round 13
	vaesdeclast	xmm8, [keys + 16*14]	; round 14 (last)

	; xor Tweak value
	vpxor	xmm8, [TW]

_done:
	; store last output block (callers arrange ptr_ciphertext so it lands at +16*7)
	vmovdqu	[ptr_ciphertext+16*7], xmm8

_ret_:

	mov	rbx, [_gpr + 8*0]
%ifidn __OUTPUT_FORMAT__, win64
	mov	rdi, [_gpr + 8*1]
	mov	rsi, [_gpr + 8*2]


	vmovdqa	xmm6, [_xmm + 16*0]
	vmovdqa	xmm7, [_xmm + 16*1]
	vmovdqa	xmm8, [_xmm + 16*2]
	vmovdqa	xmm9, [_xmm + 16*3]
	vmovdqa	xmm10, [_xmm + 16*4]
	vmovdqa	xmm11, [_xmm + 16*5]
	vmovdqa	xmm12, [_xmm + 16*6]
	vmovdqa	xmm13, [_xmm + 16*7]
	vmovdqa	xmm14, [_xmm + 16*8]
	vmovdqa	xmm15, [_xmm + 16*9]
%endif

	add	rsp, VARIABLE_OFFSET

	ret





	; Fewer than 8 whole blocks. In each _num_blocks_is_K case the pointers
	; are moved back by 16*(8-K) so that the shared tails (_steal_cipher,
	; _done), which use fixed offsets 112 / 16*7, address block K.
_less_than_128_bytes:
	cmp	N_val, 16
	jb	_ret_			; less than one block: nothing is processed

	; here 16 <= N_val < 128, so tmp1 is 1..7 blocks (times 16), never 0
	mov	tmp1, N_val
	and	tmp1, (7 << 4)
	cmp	tmp1, (6 << 4)
	je	_num_blocks_is_6
	cmp	tmp1, (5 << 4)
	je	_num_blocks_is_5
	cmp	tmp1, (4 << 4)
	je	_num_blocks_is_4
	cmp	tmp1, (3 << 4)
	je	_num_blocks_is_3
	cmp	tmp1, (2 << 4)
	je	_num_blocks_is_2
	cmp	tmp1, (1 << 4)
	je	_num_blocks_is_1




_num_blocks_is_7:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7

	sub	ptr_plaintext, 16*1

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_7

_steal_cipher_7:
	; generate one more tweak and swap it with the last block's tweak:
	; [TW] keeps the old one for _steal_cipher, xmm15 gets the new one
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	vmovdqa	[TW + 16*0] , xmm15
	vmovdqa	xmm15, [TW+16*1]

	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5
	vmovdqu	[ptr_ciphertext+16*5], xmm6

	sub	ptr_ciphertext, 16*1
	vmovdqa	xmm8, xmm7		; shared tail expects the last block in xmm8
	jmp	_steal_cipher

_done_7:
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5
	vmovdqu	[ptr_ciphertext+16*5], xmm6

	sub	ptr_ciphertext, 16*1
	vmovdqa	xmm8, xmm7		; shared tail expects the last block in xmm8
	jmp	_done






_num_blocks_is_6:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6

	sub	ptr_plaintext, 16*2

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_6

_steal_cipher_6:
	; same tweak swap as _steal_cipher_7, on the 6th tweak (xmm14)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	vmovdqa	[TW + 16*0] , xmm14
	vmovdqa	xmm14, [TW+16*1]

	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5

	sub	ptr_ciphertext, 16*2
	vmovdqa	xmm8, xmm6
	jmp	_steal_cipher

_done_6:
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4
	vmovdqu	[ptr_ciphertext+16*4], xmm5

	sub	ptr_ciphertext, 16*2
	vmovdqa	xmm8, xmm6
	jmp	_done





_num_blocks_is_5:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5

	sub	ptr_plaintext, 16*3

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_5

_steal_cipher_5:
	; same tweak swap as _steal_cipher_7, on the 5th tweak (xmm13)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	vmovdqa	[TW + 16*0] , xmm13
	vmovdqa	xmm13, [TW+16*1]

	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4

	sub	ptr_ciphertext, 16*3
	vmovdqa	xmm8, xmm5
	jmp	_steal_cipher

_done_5:
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3
	vmovdqu	[ptr_ciphertext+16*3], xmm4

	sub	ptr_ciphertext, 16*3
	vmovdqa	xmm8, xmm5
	jmp	_done





_num_blocks_is_4:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4

	sub	ptr_plaintext, 16*4

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_4

_steal_cipher_4:
	; same tweak swap as _steal_cipher_7, on the 4th tweak (xmm12)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	vmovdqa	[TW + 16*0] , xmm12
	vmovdqa	xmm12, [TW+16*1]

	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3

	sub	ptr_ciphertext, 16*4
	vmovdqa	xmm8, xmm4
	jmp	_steal_cipher

_done_4:
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2
	vmovdqu	[ptr_ciphertext+16*2], xmm3

	sub	ptr_ciphertext, 16*4
	vmovdqa	xmm8, xmm4
	jmp	_done




_num_blocks_is_3:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3

	sub	ptr_plaintext, 16*5

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_3

_steal_cipher_3:
	; same tweak swap as _steal_cipher_7, on the 3rd tweak (xmm11)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	vmovdqa	[TW + 16*0] , xmm11
	vmovdqa	xmm11, [TW+16*1]

	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2

	sub	ptr_ciphertext, 16*5
	vmovdqa	xmm8, xmm3
	jmp	_steal_cipher

_done_3:
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
	; store output blocks
	vmovdqu	[ptr_ciphertext+16*0], xmm1
	vmovdqu	[ptr_ciphertext+16*1], xmm2

	sub	ptr_ciphertext, 16*5
	vmovdqa	xmm8, xmm3
	jmp	_done






_num_blocks_is_2:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2

	sub	ptr_plaintext, 16*6

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_2

_steal_cipher_2:
	; same tweak swap as _steal_cipher_7, on the 2nd tweak (xmm10)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	vmovdqa	[TW + 16*0] , xmm10
	vmovdqa	xmm10, [TW+16*1]

	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
	; store output block
	vmovdqu	[ptr_ciphertext], xmm1

	sub	ptr_ciphertext, 16*6
	vmovdqa	xmm8, xmm2
	jmp	_steal_cipher

_done_2:
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
	; store output block
	vmovdqu	[ptr_ciphertext], xmm1

	sub	ptr_ciphertext, 16*6
	vmovdqa	xmm8, xmm2
	jmp	_done













_num_blocks_is_1:
	initialize	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1

	sub	ptr_plaintext, 16*7

	and	N_val, 15		; N_val = N_val mod 16
	je	_done_1

_steal_cipher_1:
	; same tweak swap as _steal_cipher_7, on the 1st tweak (xmm9)
	xor	ghash_poly_8b_temp, ghash_poly_8b_temp
	shl	twtempl, 1
	adc	twtemph, twtemph
	cmovc	ghash_poly_8b_temp, ghash_poly_8b
	xor	twtempl, ghash_poly_8b_temp
	mov	[TW+8*2], twtempl
	mov	[TW+8*3], twtemph

	vmovdqa	[TW + 16*0] , xmm9
	vmovdqa	xmm9, [TW+16*1]

	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
	; nothing to store here: the only block is written by the shared tail

	sub	ptr_ciphertext, 16*7
	vmovdqa	xmm8, xmm1
	jmp	_steal_cipher

_done_1:
	encrypt_initial	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
	; nothing to store here: the only block is written at _done

	sub	ptr_ciphertext, 16*7
	vmovdqa	xmm8, xmm1
	jmp	_done

section .data
align 16

vpshufb_shf_table:
; use these values for shift constants for the vpshufb instruction
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 
0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm new file mode 100644 index 000000000..2e77e5e80 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm @@ -0,0 +1,1898 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_expanded_key_sse( +; UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine 
ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*14] + movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*13] + movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*2] + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*12] + movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*3] + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*11] + movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*4] + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + movdqu %%xkey1, 
[%%ptr_key1 + 16*10] + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*5] + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*9] + movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*6] + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*8] + movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*7] + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*7] + movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*8] + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*6] + movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*9] + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*5] + movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*10] + aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*4] + movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*11] + aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*3] + movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*12] + aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*2] + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*13] + aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*1] + movdqa 
[%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*14] + aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*0] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] 
+%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; decrypt initial blocks of AES (the rounds use aesdec/aesdeclast; the macro keeps its encrypt_initial name) +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values are generated only when %%lt128 is 0 +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 (not referenced in this macro body) + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register (holds the current round key) +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes: when nonzero, next-Tweak generation and the Tweak reload are skipped + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor 
%%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if 
(%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, 
ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 
generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 12 + movdqa %%T0, [keys + 16*12] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 13 + movdqa %%T0, [keys + 16*13] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 14 + movdqa %%T0, [keys + 16*14] + aesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, 
%%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values (tweaks 1 to 7 from the TW scratch area) + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Decrypt 8 blocks in parallel (the rounds use aesdec/aesdeclast; the macro keeps its encrypt_by_eight name) +; generate next 8 tweak values when %%last_eight is 0 +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 (the callers in this file pass a memory operand in the TW area, not a register) +%define %%T0 %17 ; Temp register (holds the current round key) +%define %%last_eight %18 ; when nonzero, the interleaved next-tweak arithmetic is skipped + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW 
+ 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, 
%%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + movdqa %%T0, [keys + 16*12] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor 
ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + movdqa %%T0, [keys + 16*13] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + movdqa %%T0, [keys + 16*14] + aesdeclast %%ST1, %%T0 + aesdeclast %%ST2, %%T0 + aesdeclast %%ST3, %%T0 + aesdeclast %%ST4, %%T0 + aesdeclast %%ST5, %%T0 + aesdeclast %%ST6, %%T0 + aesdeclast %%ST7, %%T0 + aesdeclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_dec_expanded_key_sse, function +XTS_AES_256_dec_expanded_key_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, 
win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are handled separately at _last_eight + jl _less_than_128_bytes ; NOTE(review): signed branch on a byte count; jb would be the unsigned form, confirm N_val never exceeds the signed range + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks mod 8) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store the 7 output blocks produced by encrypt_initial (written through ptr_ciphertext) + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store the 6 output blocks produced by encrypt_initial (written through ptr_ciphertext) + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu 
[ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add 
ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + 
mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + movdqa xmm1, [TW + 16*7] + movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, 
[ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + 
+ ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;decrypt last block with cipher stealing (13 aesdec rounds followed by one aesdeclast) + pxor xmm8, [keys] ; ARK + aesdec xmm8, [keys + 16*1] ; round 1 + aesdec xmm8, [keys + 16*2] ; round 2 + aesdec xmm8, [keys + 16*3] ; round 3 + aesdec xmm8, [keys + 16*4] ; round 4 + aesdec xmm8, [keys + 16*5] ; round 5 + aesdec xmm8, [keys + 16*6] ; round 6 + aesdec xmm8, [keys + 16*7] ; round 7 + aesdec xmm8, [keys + 16*8] ; round 8 + aesdec xmm8, [keys + 16*9] ; round 9 + aesdec xmm8, [keys + 16*10] ; round 10 + aesdec xmm8, [keys + 16*11] ; round 11 + aesdec xmm8, [keys + 16*12] ; round 12 + aesdec xmm8, [keys + 16*13] ; round 13 + aesdeclast xmm8, [keys + 16*14] ; round 14 (final round) + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last output block (held in xmm8) at block offset 7 from ptr_ciphertext + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + ; epilogue: restore the registers saved at function entry and release the stack frame + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ ; fewer than 16 bytes: return without processing any data + + mov tmp1, N_val + and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks), 1 to 7 on this path + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 ; bias the pointer so the shared _steal_cipher code can use its fixed offset of 112 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor 
ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm15 + movdqa xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm14 + movdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub 
ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _done + + + + + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm13 + movdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl 
twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm12 + movdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm11 + movdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + 
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm10 + movdqa xmm10, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm9 + movdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +pshufb_shf_table: +; use these 
values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm new file mode 100644 index 000000000..69228c18c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm @@ -0,0 +1,1808 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; the round keys of the data key (k1, second argument) are copied to the stack, aligned to 16 Bytes +; the tweak key (k2, first argument) is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rbx, rdi, rsi +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_expanded_key_vaes( +; UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 
+%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*14] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*13] + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*12] + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*11] + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in 
stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*11] + vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*12] + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*13] + vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*14] + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*0] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; Original way to generate initial tweak values and load plaintext values +; only used for small blocks +%macro 
initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph 
+ cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; Original decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values can be generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, 
%%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, 
%%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 
+%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + 
vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 
+%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 11 + 
vbroadcasti32x4 %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Decrypt 16 blocks in parallel +; generate next 16 tweak values into zmm15-zmm18 (computed only when %%last_eight is 0) +%macro decrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + ; zmm15 = %%TW3 with every 128-bit lane shifted left one byte; the shifted-out byte is carry-less multiplied by zpoly and xored back in + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 
+ vaesdec %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + 
+ +section .text + +; Entry point: XTS decryption of one sector with pre-expanded AES-256 round keys (arguments are listed in the prototype comment near the top of this file) +mk_global XTS_AES_256_dec_expanded_key_vaes, function +XTS_AES_256_dec_expanded_key_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp ; rbp + 8 = rsp at function entry + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; 64-byte align the frame; rsp is no longer at a fixed distance from the entry rsp +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx ; rbx is used as twtemph +%ifidn __OUTPUT_FORMAT__, win64 + ; rdi, rsi and xmm6-xmm15 are callee-saved in the Microsoft x64 ABI + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + ; encrypt the tweak with k2 and copy the k1 round keys to the stack area `keys` + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + ; arguments 5 and 6 are passed on the stack at entry_rsp + 8*5 and entry_rsp + 8*6 +%ifdef ALIGN_STACK + ; rsp was re-aligned above, so reach the arguments through rbp (entry_rsp = rbp + 8) + mov ptr_plaintext, [rbp + 8 + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rbp + 8 + 8*6] ; ciphertext pointer +%else + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif +%endif + + ; N_val is an unsigned 64-bit byte count: use unsigned condition codes + cmp N_val, 128 + jb _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jae _start_by16 + + cmp N_val, 128 + jae _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jae _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jae _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jae _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jae _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jae _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jae _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jae _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqu xmm1, [ptr_plaintext - 16] ; Redo last block with next tweak + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext - 16], xmm1 + vmovdqa xmm8, xmm1 + + ; Calc previous tweak + mov 
tmp1, 1 + kmovq k1, tmp1 + vpsllq xmm13, xmm9, 63 + vpsraq xmm14, xmm13, 63 + vpandq xmm5, xmm14, XWORD(zpoly) + vpxorq xmm9 {k1}, xmm9, xmm5 + vpsrldq xmm10, xmm9, 8 + vpshrdq xmm0, xmm9, xmm10, 1 + vpslldq xmm13, xmm13, 8 + vpxorq xmm0, xmm0, xmm13 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7_remain + vextracti32x4 xmm12, zmm10, 2 + vextracti32x4 xmm13, zmm10, 3 + vinserti32x4 zmm10, xmm13, 2 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + vextracti32x4 xmm8, zmm2, 0x2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_7_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + jmp _ret_ + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6_remain + vextracti32x4 xmm12, zmm10, 1 + vextracti32x4 xmm13, zmm10, 2 + vinserti32x4 zmm10, xmm13, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + vextracti32x4 xmm8, zmm2, 0x1 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_6_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + jmp _ret_ + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5_remain + vmovdqa xmm12, xmm10 + vextracti32x4 xmm10, zmm10, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu 
[ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_5_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4_remain + vextracti32x4 xmm12, zmm9, 3 + vinserti32x4 zmm9, xmm10, 3 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + vextracti32x4 xmm8, zmm1, 0x3 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_4_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + jmp _ret_ + +_remaining_num_blocks_is_3: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3_remain + vextracti32x4 xmm13, zmm9, 2 + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + vmovdqa xmm0, xmm13 + jmp _steal_cipher +_done_3_remain: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + jmp _ret_ + +_remaining_num_blocks_is_2: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2_remain + vextracti32x4 xmm10, zmm9, 2 + vextracti32x4 xmm12, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, 
xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_2_remain: + vextracti32x4 xmm10, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + and N_val, 15 + je _done_1_remain + vextracti32x4 xmm11, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + vmovdqa xmm8, xmm1 + vmovdqa xmm0, xmm9 + jmp _steal_cipher +_done_1_remain: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + jmp _ret_ + + + +_start_by16: + ; Make first 8 tweak values (tweak x 2^0..2^7, four per register in zmm9 and zmm10) + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweak values (zmm11, zmm12) by multiplying all of the above by 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, 
zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_start_by8: + ; Make first 8 tweak values (tweak x 2^0..2^7, four per register in zmm9 and zmm10) + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 
16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdec xmm8, [keys + 16*10] ; round 10 + vaesdec xmm8, [keys + 16*11] ; round 11 + vaesdec xmm8, [keys + 16*12] ; round 12 + vaesdec xmm8, [keys + 16*13] ; round 13 + vaesdeclast xmm8, [keys + 16*14] ; round 14 (final round) + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + 
initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa64 xmm16, xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa64 xmm0, xmm16 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + jmp _done + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm15, xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu 
[ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm0, xmm15 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + jmp _done + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm14, xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm0, xmm14 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + jmp _done + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, 
ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm13, xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm0, xmm13 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm12, xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm0, xmm12 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + jmp _done + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, 
xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm11, xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm0, xmm11 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + jmp _done + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + and N_val, 15 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm0, xmm10 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 
0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. 
+%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_256_dec_expanded_key_vaes +no_XTS_AES_256_dec_expanded_key_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm new file mode 100644 index 000000000..3904c8a54 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm @@ -0,0 +1,1963 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key argument (k1) is expanded and stored in the stack, aligned to 16 Bytes +; first key argument (k2) is required only once (tweak encryption), no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_sse( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" decryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data +; UINT8 *pt); // plaintext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn 
__OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of aeskeygenassist instruction +; round_key value before a key_expansion_256_flip/_flop macro is the round key from two rounds earlier +; round_key value after a key_expansion_256_flip/_flop macro is the newly generated round key +; the 2 macros are used alternately (flip, flop, flip, ...) for key generation; +; they differ only in which dword of raw_key is broadcast (flip: dword 3, flop: dword 2) +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 11111111b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 10101010b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 11 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define %%xraw_key %6 
+%define %%xtmp %7 +%define %%xtmp2 %8 +%define %%ptr_key2 %9 +%define %%ptr_key1 %10 +%define %%ptr_expanded_keys %11 + + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + movdqu %%xkey2_2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + movdqu %%xkey1_2, [%%ptr_key1 + 16*1] + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*13], %%xtmp2 + + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*12], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*11], %%xtmp2 + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*10], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, 
%%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*9], %%xtmp2 + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*8], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*7], %%xtmp2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*6], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*5], %%xtmp2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + 
key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*4], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*3], %%xtmp2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + aesimc %%xtmp2, %%xkey1 + movdqa [%%ptr_expanded_keys+16*2], %%xtmp2 + + aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + aesimc %%xtmp2, %%xkey1_2 + movdqa [%%ptr_expanded_keys+16*1], %%xtmp2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 
+%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor 
twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; decrypt initial blocks of AES (rounds use aesdec/aesdeclast) +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values are generated when %%lt128 is 0 +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes: nonzero skips next-Tweak generation + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, 
ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + 
aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if 
(0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesdec %%ST1, %%T0 +%if 
(%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 12 + movdqa %%T0, [keys + 16*12] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 13 + movdqa %%T0, [keys + 16*13] + aesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdec %%ST7, %%T0 +%endif + + ; round 14 + movdqa %%T0, [keys + 16*14] + aesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, 
[TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Decrypt 8 blocks in parallel (rounds use aesdec/aesdeclast) +; generate next 8 tweak values when %%last_eight is 0 +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 ; nonzero: skip the stitched next-tweak computation + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 
16*3] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == 
%%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + movdqa %%T0, [keys + 16*12] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + movdqa %%T0, [keys + 16*13] + aesdec %%ST1, %%T0 + aesdec %%ST2, %%T0 + aesdec %%ST3, %%T0 + aesdec %%ST4, %%T0 + aesdec %%ST5, %%T0 + aesdec %%ST6, %%T0 + aesdec %%ST7, %%T0 + aesdec %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + movdqa 
%%T0, [keys + 16*14] + aesdeclast %%ST1, %%T0 + aesdeclast %%ST2, %%T0 + aesdeclast %%ST3, %%T0 + aesdeclast %%ST4, %%T0 + aesdeclast %%ST5, %%T0 + aesdeclast %%ST6, %%T0 + aesdeclast %%ST7, %%T0 + aesdeclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_dec_sse, function +XTS_AES_256_dec_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are handled at _last_eight without stitched Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, 
(7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu 
[ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add 
ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov 
[TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + + and N_val, 15 ; N_val = N_val mod 16 + je _done_final + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + movdqa xmm1, [TW + 16*7] + movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt + + mov [TW + 16*7], twtempl + mov [TW + 16*7+8], twtemph + + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu 
[ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + jmp _steal_cipher + + +_done_final: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + jmp _done + + +_steal_cipher: + ; start cipher stealing + + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesdec xmm8, [keys + 16*1] ; round 1 + aesdec xmm8, [keys + 16*2] ; round 2 + aesdec xmm8, [keys + 16*3] ; round 3 + aesdec xmm8, [keys + 16*4] ; round 4 + aesdec xmm8, [keys + 16*5] ; round 5 + aesdec xmm8, [keys + 16*6] ; round 
6 + aesdec xmm8, [keys + 16*7] ; round 7 + aesdec xmm8, [keys + 16*8] ; round 8 + aesdec xmm8, [keys + 16*9] ; round 9 + aesdec xmm8, [keys + 16*10] ; round 10 + aesdec xmm8, [keys + 16*11] ; round 11 + aesdec xmm8, [keys + 16*12] ; round 12 + aesdec xmm8, [keys + 16*13] ; round 13 + aesdeclast xmm8, [keys + 16*14] ; round 14 (final) + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + + + + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + + sub ptr_plaintext, 16*1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm15 + movdqa xmm15, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + 
movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + jmp _done + + + + + + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + + sub ptr_plaintext, 16*2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm14 + movdqa xmm14, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + jmp _done + + + + + 
+_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + + sub ptr_plaintext, 16*3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm13 + movdqa xmm13, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + jmp _done + + + + + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + + sub ptr_plaintext, 16*4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm12 + movdqa xmm12, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + 
+ sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + jmp _done + + + + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + + sub ptr_plaintext, 16*5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm11 + movdqa xmm11, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + jmp _done + + + + + + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + + sub ptr_plaintext, 16*6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm10 + movdqa xmm10, [TW+16*1] + 
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + jmp _done + + + + + + + + + + + + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + + movdqa [TW + 16*0] , xmm9 + movdqa xmm9, [TW+16*1] + + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786,
0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm new file mode 100644 index 000000000..3e26e5c04 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm @@ -0,0 +1,1875 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS decrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_dec_vaes( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" decryption, 16*2 bytes +; UINT8
*TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *ct, // ciphertext sector input data (read via ptr_plaintext) +; UINT8 *pt); // plaintext sector output data (written via ptr_ciphertext) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_256_flip/flop macro is current round key +; round_key value after this key_expansion_256_flip/flop macro is next round key +; 2 macros will be used for key generation in a flip-flopped fashion +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 10101010b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor
%%xround_key, %%xraw_key +%endmacro + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 11 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define %%xraw_key %6 +%define %%xtmp %7 +%define %%xtmp2 %8 +%define %%ptr_key2 %9 +%define %%ptr_key1 %10 +%define %%ptr_expanded_keys %11 + + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1] + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2 + + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa 
[%%ptr_expanded_keys+16*10], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak 
encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + vaesimc %%xtmp2, %%xkey1 + vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + vaesimc %%xtmp2, %%xkey1_2 + vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, 
%%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; Original way to generate initial tweak values and load plaintext values +; only used for small blocks +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc 
twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; Original decrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted +; next 8 Tweak values can be generated +%macro decrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks decrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + 
vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if 
(%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], 
twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov 
[TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdec %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdec %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdec %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdec %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdec %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdec %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesdeclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesdeclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesdeclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesdeclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesdeclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesdeclast %%ST7, %%T0 +%endif + + ; xor Tweak 
values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Decrypt 8 blocks in parallel +; generate next 8 tweak values +%macro decrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec 
%%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + vaesdeclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Decrypt 16 blocks in parallel +; generate next 16 tweak values +%macro decrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesdec %%ST1, %%T0 + vaesdec
%%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesdec %%ST1, %%T0 + vaesdec %%ST2, %%T0 + vaesdec %%ST3, %%T0 + vaesdec %%ST4, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesdeclast %%ST1, %%T0 + 
vaesdeclast %%ST2, %%T0 + vaesdeclast %%ST3, %%T0 + vaesdeclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 +%endmacro + + +section .text + +mk_global XTS_AES_256_dec_vaes, function +XTS_AES_256_dec_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge 
_remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqu xmm1, [ptr_plaintext - 16] ; Re-do last block with next tweak + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext - 16], xmm1 + vmovdqa xmm8, xmm1 + + ; Calc previous tweak + mov tmp1, 1 + kmovq k1, tmp1 + vpsllq xmm13, xmm9, 63 + vpsraq xmm14, xmm13, 63 + vpandq xmm5, xmm14, XWORD(zpoly) + vpxorq xmm9 {k1}, xmm9, xmm5 + vpsrldq xmm10, xmm9, 8 + vpshrdq xmm0, xmm9, xmm10, 1 + vpslldq xmm13, xmm13, 8 + vpxorq xmm0, xmm0, xmm13 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7_remain + vextracti32x4 xmm12, zmm10, 2 + vextracti32x4 xmm13, zmm10, 3 + vinserti32x4 zmm10, xmm13, 2 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + vextracti32x4 xmm8, zmm2, 0x2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_7_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + jmp _ret_ + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6_remain + vextracti32x4 xmm12, zmm10, 1 + vextracti32x4 xmm13, zmm10, 2 + vinserti32x4 zmm10, xmm13, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + vextracti32x4 xmm8, zmm2, 0x1 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_6_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + jmp _ret_ + 
+_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5_remain + vmovdqa xmm12, xmm10 + vextracti32x4 xmm10, zmm10, 1 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_5_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4_remain + vextracti32x4 xmm12, zmm9, 3 + vinserti32x4 zmm9, xmm10, 3 + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + vextracti32x4 xmm8, zmm1, 0x3 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_4_remain: + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + jmp _ret_ + +_remaining_num_blocks_is_3: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3_remain + vextracti32x4 xmm13, zmm9, 2 + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 3 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + vmovdqa xmm0, xmm13 + jmp _steal_cipher +_done_3_remain: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 
+ vmovdqu [ptr_ciphertext+16*2], xmm3 + jmp _ret_ + +_remaining_num_blocks_is_2: + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2_remain + vextracti32x4 xmm10, zmm9, 2 + vextracti32x4 xmm12, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + vmovdqa xmm0, xmm12 + jmp _steal_cipher +_done_2_remain: + vextracti32x4 xmm10, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + jmp _ret_ + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + and N_val, 15 + je _done_1_remain + vextracti32x4 xmm11, zmm9, 1 + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + vmovdqa xmm8, xmm1 + vmovdqa xmm0, xmm9 + jmp _steal_cipher +_done_1_remain: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + jmp _ret_ + + + +_start_by16: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + 
vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweak values by all x 2^8 + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3, [ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + jmp _do_n_blocks + +_start_by8: + ; Make first 7 tweak values + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + cmp N_val, 128 + jge 
_main_loop_run_8 + + jmp _do_n_blocks + +_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ;decrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesdec xmm8, [keys + 16*1] ; round 1 + vaesdec xmm8, [keys + 16*2] ; round 2 + vaesdec xmm8, [keys + 16*3] ; round 3 + vaesdec xmm8, [keys + 16*4] ; round 4 + vaesdec xmm8, [keys + 16*5] ; round 5 + vaesdec xmm8, [keys + 16*6] ; round 6 + vaesdec xmm8, [keys + 16*7] ; round 7 + vaesdec xmm8, [keys + 16*8] ; round 8 + vaesdec xmm8, [keys + 16*9] ; round 9 + vaesdec xmm8, [keys + 16*10] ; round 10 + vaesdec xmm8, [keys + 16*11] ; round 11 + vaesdec xmm8, [keys + 16*12] ; round 12 + vaesdec xmm8, [keys + 16*13] ; round 13 + vaesdeclast xmm8, [keys + 16*14] ; round 14 + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 
+ jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + and N_val, 15 + je _done_7 + +_steal_cipher_7: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa64 xmm16, xmm15 + vmovdqa xmm15, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa64 xmm0, xmm16 + vmovdqa xmm8, xmm7 + jmp _steal_cipher + +_done_7: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + jmp _done + +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + and N_val, 15 + je _done_6 + +_steal_cipher_6: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, 
ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm15, xmm14 + vmovdqa xmm14, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm0, xmm15 + vmovdqa xmm8, xmm6 + jmp _steal_cipher + +_done_6: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + jmp _done + +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + and N_val, 15 + je _done_5 + +_steal_cipher_5: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm14, xmm13 + vmovdqa xmm13, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm0, xmm14 + vmovdqa xmm8, xmm5 + jmp _steal_cipher + +_done_5: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + 
vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + jmp _done + +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + and N_val, 15 + je _done_4 + +_steal_cipher_4: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm13, xmm12 + vmovdqa xmm12, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm0, xmm13 + vmovdqa xmm8, xmm4 + jmp _steal_cipher + +_done_4: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + jmp _done + +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + and N_val, 15 + je _done_3 + +_steal_cipher_3: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm12, xmm11 + vmovdqa xmm11, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm0, xmm12 + vmovdqa xmm8, xmm3 + jmp _steal_cipher + +_done_3: + decrypt_initial xmm1, 
xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + jmp _done + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + and N_val, 15 + je _done_2 + +_steal_cipher_2: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm11, xmm10 + vmovdqa xmm10, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm0, xmm11 + vmovdqa xmm8, xmm2 + jmp _steal_cipher + +_done_2: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + jmp _done + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + and N_val, 15 + je _done_1 + +_steal_cipher_1: + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, xmm9 + vmovdqa xmm9, [TW+16*1] + + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + add ptr_ciphertext, 16*1 + vmovdqa xmm0, xmm10 + vmovdqa xmm8, xmm1 + jmp _steal_cipher + +_done_1: + decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 
1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + jmp _done + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. 
+%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_256_dec_vaes +no_XTS_AES_256_dec_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm new file mode 100644 index 000000000..0993ff909 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm @@ -0,0 +1,1708 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_avx( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn 
__OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_256 macro is current round key +; round_key value after this key_expansion_256 macro is next round key +; 2 macros will be used for key generation in a flip-flopped fashion +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 10101010b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 10 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 
+%define %%xraw_key %6 +%define %%xtmp %7 +%define %%ptr_key2 %8 +%define %%ptr_key1 %9 +%define %%ptr_expanded_keys %10 + + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2 + + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + vmovdqa 
[%%ptr_expanded_keys+16*5], %%xkey1_2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 
0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 
+%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc 
ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + 
vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, 
ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], 
twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if 
(%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in 
parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc 
%%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, 
ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 
14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + vaesenclast %%ST5, %%T0 + vaesenclast %%ST6, %%T0 + vaesenclast %%ST7, %%T0 + vaesenclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl ; stored only after the final xor above: callers in this file pass [TW+16*7] as %%TW8 + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_avx, function +XTS_AES_256_enc_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx ; rbx is reloaded from this slot at _ret_ +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi ; win64 build only: rdi, rsi and xmm6-xmm15 are saved here and reloaded at _ret_ + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys ; leaves the encrypted tweak in [TW] and the expanded round keys in keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are handled by _last_eight and will not be stitched with Tweak calculations + jl _less_than_128_bytes ; NOTE(review): jl is a signed test - assumes N_val < 2^63, confirm against callers + + add target_ptr_val,
ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, 
xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc 
ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor 
ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is the blend mask (an explicit operand in this VEX form) + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenc xmm8, [keys + 16*10] ; round 10 + vaesenc xmm8, [keys + 16*11] ; round 11 + vaesenc xmm8, [keys + 16*12] ; round 12 + vaesenc xmm8, [keys + 16*13] ; round 13 + vaesenclast xmm8, [keys + 16*14] ; round 14 (final round) + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa
xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, 
xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + 
+ sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm new file mode 100644 index 000000000..6db85486d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm @@ -0,0 +1,1653 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; k1 (second argument, the data-encryption key) is copied to the stack, aligned to 16 bytes +; k2 (first argument, the tweak key) is needed only once, so it is not stored + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_expanded_key_avx( +; UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11
+%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + 
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*11] + vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*11] + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*12] + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*12] + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*13] + vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*13] + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*14] + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*14] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 
+%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + 
xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor 
ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc 
%%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, 
%%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + 
vaesenc %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak 
values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 + vpxor %%ST2, %%T0 + vpxor %%ST3, %%T0 + vpxor %%ST4, %%T0 + vpxor %%ST5, %%T0 + vpxor %%ST6, %%T0 + vpxor %%ST7, %%T0 + vpxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 
== %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b 
+%endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, 
%%T0 + vaesenc %%ST5, %%T0 + vaesenc %%ST6, %%T0 + vaesenc %%ST7, %%T0 + vaesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + vaesenclast %%ST5, %%T0 + vaesenclast %%ST6, %%T0 + vaesenclast %%ST7, %%T0 + vaesenclast %%ST8, %%T0 + + ; xor Tweak values + vpxor %%ST1, %%TW1 + vpxor %%ST2, %%TW2 + vpxor %%ST3, %%TW3 + vpxor %%ST4, %%TW4 + vpxor %%ST5, %%TW5 + vpxor %%ST6, %%TW6 + vpxor %%ST7, %%TW7 + vpxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_expanded_key_avx, function +XTS_AES_256_enc_expanded_key_avx: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov 
target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 8 blocks (128 bytes) will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp
_main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; 
store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + vmovdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + vmovdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + vmovdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + vmovdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + vmovdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + 
adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + vmovdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;vmovdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + vmovdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + vmovdqu xmm4, [ptr_plaintext+16*3] + vmovdqu xmm5, [ptr_plaintext+16*4] + vmovdqu xmm6, [ptr_plaintext+16*5] + vmovdqu xmm7, [ptr_plaintext+16*6] + vmovdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu 
[ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm0, [twtempl+N_val] + vpshufb xmm8, xmm0 + + + vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + vmovdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm0, [twtempl] + vpxor xmm0, [mask1] + vpshufb xmm3, xmm0 + + vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is the blend mask (an explicit operand in this VEX form) + + ; xor Tweak value + vmovdqa xmm8, [TW] + vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped + + + ;encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenc xmm8, [keys + 16*10] ; round 10 + vaesenc xmm8, [keys + 16*11] ; round 11 + vaesenc xmm8, [keys + 16*12] ; round 12 + vaesenc xmm8, [keys + 16*13] ; round 13 + vaesenclast xmm8, [keys + 16*14] ; round 14 (final round) + + ; xor Tweak value + vpxor xmm8, [TW] + +_done: + ; store last ciphertext value + vmovdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] 
+%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm6 + + and 
N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu 
[ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm 
b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm new file mode 100644 index 000000000..51cb31074 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm @@ -0,0 +1,1652 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; expanded keys are not aligned +; plaintext and ciphertext are not aligned +; k1 (second argument, the "ECB" data key) is copied to the stack, aligned to 16 Bytes +; k2 (first argument, the tweak key) is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_expanded_key_sse( +; UINT8 *k2, // key used for tweaking, 16*15 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 
+%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx + + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*1] + movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*2] + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*2] + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*3] + aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*3] + movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*4] + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*4] + movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*5] + aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*5] + movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*6] + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*6] + movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, 
[%%ptr_key2 + 16*7] + aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*7] + movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*8] + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*8] + movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*9] + aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*9] + movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*10] + aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*10] + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + + movdqu %%xkey2, [%%ptr_key2 + 16*11] + aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*11] + movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*12] + aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*12] + movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*13] + aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*13] + movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + movdqu %%xkey2, [%%ptr_key2 + 16*14] + aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + movdqu %%xkey1, [%%ptr_key1 + 16*14] + movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; 
state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl 
+ mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + 
%endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc 
%%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp 
+ shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if 
(%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 12 + movdqa %%T0, [keys + 16*12] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 13 + movdqa %%T0, [keys + 16*13] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 14 + movdqa %%T0, [keys + 16*14] + aesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa 
%%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + 
aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + 
mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + movdqa %%T0, [keys + 16*12] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + movdqa %%T0, [keys + 16*13] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + movdqa %%T0, [keys + 16*14] + aesenclast %%ST1, %%T0 + 
aesenclast %%ST2, %%T0 + aesenclast %%ST3, %%T0 + aesenclast %%ST4, %%T0 + aesenclast %%ST5, %%T0 + aesenclast %%ST6, %%T0 + aesenclast %%ST7, %%T0 + aesenclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_expanded_key_sse, function +XTS_AES_256_enc_expanded_key_sse: + endbranch + + sub rsp, VARIABLE_OFFSET + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes, handled at _last_eight) will not be stitched with Tweak calculations + jl _less_than_128_bytes + + add target_ptr_val, ptr_ciphertext + + + mov tmp1, N_val + and tmp1, (7 << 4) + jz _initial_num_blocks_is_0 +
+ cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 << 4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu 
[ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, 
xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph 
+ ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + 
mov [TW], twtempl + mov [TW + 8], twtemph + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesenc xmm8, [keys + 16*1] ; round 1 + aesenc xmm8, [keys + 16*2] ; round 2 + aesenc xmm8, [keys + 16*3] ; round 3 + aesenc xmm8, [keys + 16*4] ; round 4 + aesenc xmm8, [keys + 16*5] ; round 5 + aesenc xmm8, [keys + 16*6] ; round 6 + aesenc xmm8, [keys + 16*7] ; round 7 + aesenc xmm8, [keys + 16*8] ; round 8 + aesenc xmm8, [keys + 16*9] ; round 9 + aesenc xmm8, [keys + 16*10] ; round 10 + aesenc xmm8, [keys + 16*11] ; round 11 + aesenc xmm8, [keys + 16*12] ; round 12 + aesenc xmm8, [keys + 16*13] ; round 13 + aesenclast xmm8, [keys + 16*14] ; round 14 (final round) + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 +
jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu 
[ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + + 
and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm new file mode 100644 index 000000000..37a5dc792 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm @@ -0,0 +1,1634 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; expanded keys are not aligned +; round keys of k1 are copied to the stack in parallel with the tweak encryption (no key expansion is done here) +; plaintext and ciphertext are not aligned +; k1 (second argument, data key) is stored in the stack as aligned to 16 Bytes +; k2 (first argument, tweak key) is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_expanded_key_vaes( +; UINT8 *k2, // expanded key used for tweaking, 16*15 bytes (15 round keys) +; UINT8 *k1, // expanded key used for "ECB" encryption, 16*15 bytes (15 round keys) +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi +
%define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; macro to encrypt the tweak value + +%macro encrypt_T 8 +%define %%xkey2 %1 +%define %%xstate_tweak %2 +%define %%xkey1 %3 +%define %%xraw_key %4 +%define %%xtmp %5 +%define %%ptr_key2 %6 +%define %%ptr_key1 %7 +%define %%ptr_expanded_keys %8 + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*2] + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*2] + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*3] + vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*3] + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*4] + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*4] + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*5] + vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*5] + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*6] + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*6] + vmovdqa 
[%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*7] + vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*7] + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*8] + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*8] + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*9] + vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*9] + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*10] + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*10] + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack + + + vmovdqu %%xkey2, [%%ptr_key2 + 16*11] + vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*11] + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*12] + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*12] + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*13] + vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*13] + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack + + vmovdqu %%xkey2, [%%ptr_key2 + 16*14] + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1 + 16*14] + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + + +; generate initial tweak values +; load initial 
plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 
1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + 
vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + 
vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc 
%%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if 
(%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 13 + vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + 
vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; 
round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Encrypt 16 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define %%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc 
%%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm16, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm18, zmm16, 1 + vpxord zmm18, zmm18, zmm14 +%endif + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + vaesenclast %%ST3, %%T0 + vaesenclast %%ST4, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 + vmovdqa32 %%TW3, zmm17 + vmovdqa32 %%TW4, zmm18 
+%endmacro + + +section .text + +mk_global XTS_AES_256_enc_expanded_key_vaes, function +XTS_AES_256_enc_expanded_key_vaes: + endbranch + +%define ALIGN_STACK +%ifdef ALIGN_STACK + push rbp + mov rbp, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~63 +%else + sub rsp, VARIABLE_OFFSET +%endif + + mov [_gpr + 8*0], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi + mov [_gpr + 8*2], rsi + + vmovdqa [_xmm + 16*0], xmm6 + vmovdqa [_xmm + 16*1], xmm7 + vmovdqa [_xmm + 16*2], xmm8 + vmovdqa [_xmm + 16*3], xmm9 + vmovdqa [_xmm + 16*4], xmm10 + vmovdqa [_xmm + 16*5], xmm11 + vmovdqa [_xmm + 16*6], xmm12 + vmovdqa [_xmm + 16*7], xmm13 + vmovdqa [_xmm + 16*8], xmm14 + vmovdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + vmovdqu xmm1, [T_val] ; read initial Tweak value + vpxor xmm4, xmm4 ; for key expansion + encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + cmp N_val, 128 + jl _less_than_128_bytes + + vpbroadcastq zpoly, ghash_poly_8b + + cmp N_val, 256 + jge _start_by16 + + cmp N_val, 128 + jge _start_by8 + +_do_n_blocks: + cmp N_val, 0 + je _ret_ + + cmp N_val, (7*16) + jge _remaining_num_blocks_is_7 + + cmp N_val, (6*16) + jge _remaining_num_blocks_is_6 + + cmp N_val, (5*16) + jge _remaining_num_blocks_is_5 + + cmp N_val, (4*16) + jge _remaining_num_blocks_is_4 + + cmp N_val, (3*16) + jge _remaining_num_blocks_is_3 + + cmp N_val, (2*16) + jge _remaining_num_blocks_is_2 + + cmp N_val, (1*16) + jge _remaining_num_blocks_is_1 + +;; _remaining_num_blocks_is_0: + vmovdqa xmm8, xmm0 + vmovdqa xmm0, xmm9 + jmp _steal_cipher + +_remaining_num_blocks_is_7: + mov tmp1, -1 + shr tmp1, 16 + kmovq k1, tmp1 + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4] + add ptr_plaintext, 16*7 + 
encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2 + add ptr_ciphertext, 16*7 + + vextracti32x4 xmm8, zmm2, 0x2 + vextracti32x4 xmm0, zmm10, 0x3 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_6: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 ymm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*6 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], ymm2 + add ptr_ciphertext, 16*6 + + vextracti32x4 xmm8, zmm2, 0x1 + vextracti32x4 xmm0, zmm10, 0x2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_5: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 16*5 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu [ptr_ciphertext+16*4], xmm2 + add ptr_ciphertext, 16*5 + + vmovdqa xmm8, xmm2 ; xmm8 = last cipher block; VEX-encoded like the sibling branches (no legacy-SSE op in this AVX-512 path) + vextracti32x4 xmm0, zmm10, 0x1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_4: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + add ptr_plaintext, 16*4 + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1 + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + add ptr_ciphertext, 16*4 + + vextracti32x4 xmm8, zmm1, 0x3 + vextracti32x4 xmm0, zmm10, 0x0 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_3: + vextracti32x4 xmm10, zmm9, 1 + vextracti32x4 xmm11, zmm9, 2 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + vmovdqu xmm3, [ptr_plaintext+16*2] + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + vmovdqa xmm8, xmm3 + vextracti32x4 xmm0, zmm9, 3 + and N_val, 15 + je _ret_ + jmp
_steal_cipher + +_remaining_num_blocks_is_2: + vextracti32x4 xmm10, zmm9, 1 + vmovdqu xmm1, [ptr_plaintext+16*0] + vmovdqu xmm2, [ptr_plaintext+16*1] + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + add ptr_ciphertext, 16*2 + + vmovdqa xmm8, xmm2 + vextracti32x4 xmm0, zmm9, 2 + and N_val, 15 + je _ret_ + jmp _steal_cipher + +_remaining_num_blocks_is_1: + vmovdqu xmm1, [ptr_plaintext] + add ptr_plaintext, 16 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + vmovdqa xmm8, xmm1 + vextracti32x4 xmm0, zmm9, 1 + and N_val, 15 + je _ret_ + jmp _steal_cipher + + +_start_by16: + ; Make first 8 tweak values: [TW] x 2^{0..7} (zmm9 = x 2^{0..3}, zmm10 = x 2^{4..7}) + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + + ; Make next 8 tweak values: each of the first 8 x 2^8 (byte shift, top byte folded back via zpoly) + vpsrldq zmm13, zmm9, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm11, zmm9, 1 + vpxord zmm11, zmm11, zmm14 + + vpsrldq zmm15, zmm10, 15 + vpclmulqdq zmm16, zmm15, zpoly, 0 + vpslldq zmm12, zmm10, 1 + vpxord zmm12, zmm12, zmm16 + +_main_loop_run_16: + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + vmovdqu8 zmm3,
[ptr_plaintext+16*8] + vmovdqu8 zmm4, [ptr_plaintext+16*12] + add ptr_plaintext, 256 + + encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0 + + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + vmovdqu8 [ptr_ciphertext+16*8], zmm3 + vmovdqu8 [ptr_ciphertext+16*12], zmm4 + add ptr_ciphertext, 256 + sub N_val, 256 + + cmp N_val, 256 + jge _main_loop_run_16 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm4, 0x3 ; keep last encrypted block + jmp _do_n_blocks + +_start_by8: + ; Make first 8 tweak values: [TW] x 2^{0..7} (zmm9 = x 2^{0..3}, zmm10 = x 2^{4..7}) + vbroadcasti32x4 zmm0, [TW] + vbroadcasti32x4 zmm8, [shufb_15_7] + mov tmp1, 0xaa + kmovq k2, tmp1 + + ; Mult tweak by 2^{3, 2, 1, 0} + vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8 + vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0 + vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8 + vpclmulqdq zmm3, zmm2, zpoly, 0x00 + vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0 + vpxord zmm9, zmm3, zmm4 + + ; Mult tweak by 2^{7, 6, 5, 4} + vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4 + vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4 + vpclmulqdq zmm7, zmm6, zpoly, 0x00 + vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4 + vpxord zmm10, zmm7, zmm5 + +_main_loop_run_8: + ; load plaintext + vmovdqu8 zmm1, [ptr_plaintext+16*0] + vmovdqu8 zmm2, [ptr_plaintext+16*4] + add ptr_plaintext, 128 + + encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0 + + ; store ciphertext + vmovdqu8 [ptr_ciphertext+16*0], zmm1 + vmovdqu8 [ptr_ciphertext+16*4], zmm2 + add ptr_ciphertext, 128 + sub N_val, 128 + + cmp N_val, 128 + jge _main_loop_run_8 + + vextracti32x4 xmm0, zmm2, 0x3 ; keep last encrypted block + jmp _do_n_blocks + +_steal_cipher_next: + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + vmovdqa xmm0, [TW]
+_steal_cipher: + ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak + vmovdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [vpshufb_shf_table] + vmovdqu xmm10, [twtempl+N_val] + vpshufb xmm8, xmm10 + + vmovdqu xmm3, [ptr_plaintext - 16 + N_val] + vmovdqu [ptr_ciphertext - 16 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [vpshufb_shf_table +16] + sub twtempl, N_val + vmovdqu xmm10, [twtempl] + vpxor xmm10, [mask1] + vpshufb xmm3, xmm10 + + vpblendvb xmm3, xmm3, xmm2, xmm10 + + ; xor Tweak value + vpxor xmm8, xmm3, xmm0 + + ; encrypt last block with cipher stealing + vpxor xmm8, [keys] ; ARK + vaesenc xmm8, [keys + 16*1] ; round 1 + vaesenc xmm8, [keys + 16*2] ; round 2 + vaesenc xmm8, [keys + 16*3] ; round 3 + vaesenc xmm8, [keys + 16*4] ; round 4 + vaesenc xmm8, [keys + 16*5] ; round 5 + vaesenc xmm8, [keys + 16*6] ; round 6 + vaesenc xmm8, [keys + 16*7] ; round 7 + vaesenc xmm8, [keys + 16*8] ; round 8 + vaesenc xmm8, [keys + 16*9] ; round 9 + vaesenc xmm8, [keys + 16*10] ; round 10 + vaesenc xmm8, [keys + 16*11] ; round 11 + vaesenc xmm8, [keys + 16*12] ; round 12 + vaesenc xmm8, [keys + 16*13] ; round 13 + vaesenclast xmm8, [keys + 16*14] ; round 14 (final) + + ; xor Tweak value + vpxor xmm8, xmm8, xmm0 + + ; store last ciphertext value + vmovdqu [ptr_ciphertext - 16], xmm8 + +_ret_: + mov rbx, [_gpr + 8*0] + +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + vmovdqa xmm6, [_xmm + 16*0] + vmovdqa xmm7, [_xmm + 16*1] + vmovdqa xmm8, [_xmm + 16*2] + vmovdqa xmm9, [_xmm + 16*3] + vmovdqa xmm10, [_xmm + 16*4] + vmovdqa xmm11, [_xmm + 16*5] + vmovdqa xmm12, [_xmm + 16*6] + vmovdqa xmm13, [_xmm + 16*7] + vmovdqa xmm14, [_xmm + 16*8] + vmovdqa xmm15, [_xmm + 16*9] +%endif + +%ifndef ALIGN_STACK + add rsp, VARIABLE_OFFSET +%else + mov rsp, rbp + pop rbp +%endif + ret + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je _num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + vmovdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + vmovdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + vmovdqu [ptr_ciphertext+16*5], xmm6 + + add ptr_ciphertext, 16*6 + vmovdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu 
[ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + vmovdqu [ptr_ciphertext+16*4], xmm5 + + add ptr_ciphertext, 16*5 + vmovdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + vmovdqu [ptr_ciphertext+16*3], xmm4 + + add ptr_ciphertext, 16*4 + vmovdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext+16*0], xmm1 + vmovdqu [ptr_ciphertext+16*1], xmm2 + vmovdqu [ptr_ciphertext+16*2], xmm3 + + add ptr_ciphertext, 16*3 + vmovdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + vmovdqu [ptr_ciphertext], xmm1 + vmovdqu [ptr_ciphertext+16], xmm2 + + add ptr_ciphertext, 16*2 + vmovdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add 
ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + vmovdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16*1 + vmovdqa xmm8, xmm1 + and N_val, 15 ; N_val = N_val mod 16 + je _ret_ + jmp _steal_cipher_next + +section .data +align 16 + +vpshufb_shf_table: +; use these values for shift constants for the vpshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + +const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3 +const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5 +const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7 +const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1 + +shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
+%ifidn __OUTPUT_FORMAT__, win64 +global no_XTS_AES_256_enc_expanded_key_vaes +no_XTS_AES_256_enc_expanded_key_vaes: +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm new file mode 100644 index 000000000..5b805b74d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm @@ -0,0 +1,1708 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_sse( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn 
__OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +; tmp1 and target_ptr_val are defined on the same registers as ptr_key2 and ptr_key1, +; so they may only be written once both key pointers are no longer needed +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define target_ptr_val rsi + %define ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define target_ptr_val rdx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation (low 64 bits of the tweak) +%define twtemph rbx ; high 64 bits of the tweak + + +; produce the key for the next round +; raw_key is the output of aeskeygenassist instruction +; round_key value before a key_expansion_256_flip/flop macro is the round key generated two steps earlier +; round_key value after a key_expansion_256_flip/flop macro is the newly generated round key +; tmp must have its low dword equal to zero on entry; the shufps sequence leaves that dword unchanged +; 2 macros will be used for key generation in a flip-flopped fashion +; flip broadcasts dword 3 of raw_key (pshufd 11111111b), flop broadcasts dword 2 (pshufd 10101010b) +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 11111111b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + +; same sequence as key_expansion_256_flip, except that dword 2 of raw_key is broadcast instead of dword 3 +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + pshufd %%xraw_key, %%xraw_key, 10101010b + shufps %%xtmp, %%xround_key, 00010000b + pxor %%xround_key, %%xtmp + shufps %%xtmp, %%xround_key, 10001100b + pxor %%xround_key, %%xtmp + pxor %%xround_key, %%xraw_key +%endmacro + + + + +; macro to encrypt the tweak value in parallel with key generation of both keys + +%macro encrypt_T 10 +%define %%xkey2 %1 +%define %%xkey2_2 %2 +%define %%xstate_tweak %3 +%define %%xkey1 %4 +%define %%xkey1_2 %5 +%define
%%xraw_key %6 +%define %%xtmp %7 +%define %%ptr_key2 %8 +%define %%ptr_key1 %9 +%define %%ptr_expanded_keys %10 + + + movdqu %%xkey2, [%%ptr_key2] + pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + movdqu %%xkey1, [%%ptr_key1] + movdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + movdqu %%xkey2_2, [%%ptr_key2 + 16*1] + aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + movdqu %%xkey1_2, [%%ptr_key1 + 16*1] + movdqa [%%ptr_expanded_keys+16*1], %%xkey1_2 + + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + movdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + movdqa [%%ptr_expanded_keys+16*3], %%xkey1_2 + + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + movdqa [%%ptr_expanded_keys+16*4], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + movdqa [%%ptr_expanded_keys+16*5], %%xkey1_2 + 
+ + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + movdqa [%%ptr_expanded_keys+16*6], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + movdqa [%%ptr_expanded_keys+16*7], %%xkey1_2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + movdqa [%%ptr_expanded_keys+16*8], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + movdqa [%%ptr_expanded_keys+16*9], %%xkey1_2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + movdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + 
key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + movdqa [%%ptr_expanded_keys+16*11], %%xkey1_2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + movdqa [%%ptr_expanded_keys+16*12], %%xkey1 + + aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + movdqa [%%ptr_expanded_keys+16*13], %%xkey1_2 + + + aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + movdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values +; load initial plaintext values +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 
+%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 + + + ; generate next Tweak values + movdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + movdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph; + movdqa %%TW2, [TW+16*1] + movdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + movdqa %%TW3, [TW+16*2] + movdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + movdqa %%TW4, [TW+16*3] + movdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + movdqa %%TW5, [TW+16*4] + movdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + movdqa %%TW6, [TW+16*5] + movdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov 
[TW+8*12], twtempl + mov [TW+8*13], twtemph; + movdqa %%TW7, [TW+16*6] + movdqu %%ST7, [ptr_plaintext+16*6] +%endif + + + +%endmacro + + +; encrypt initial blocks of AES +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 +%if (%%num_blocks>=2) + pxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if 
(%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, 
ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 +%if 
(%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 12 + movdqa 
%%T0, [keys + 16*12] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 13 + movdqa %%T0, [keys + 16*13] + aesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenc %%ST7, %%T0 +%endif + + ; round 14 + movdqa %%T0, [keys + 16*14] + aesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + aesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + aesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + aesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + aesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + aesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + aesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + pxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + pxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + pxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + pxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + pxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + pxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + pxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + +; Encrypt 8 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_eight 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 
+%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%TW8 %16 ; tweak 8 +%define %%T0 %17 ; Temp register +%define %%last_eight %18 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + ; ARK + movdqa %%T0, [keys] + pxor %%ST1, %%T0 + pxor %%ST2, %%T0 + pxor %%ST3, %%T0 + pxor %%ST4, %%T0 + pxor %%ST5, %%T0 + pxor %%ST6, %%T0 + pxor %%ST7, %%T0 + pxor %%ST8, %%T0 + +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 1 + movdqa %%T0, [keys + 16*1] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 2 + movdqa %%T0, [keys + 16*2] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + +%endif + ; round 3 + movdqa %%T0, [keys + 16*3] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*2], twtempl + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; 
round 4 + movdqa %%T0, [keys + 16*4] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl +%endif + ; round 5 + movdqa %%T0, [keys + 16*5] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 6 + movdqa %%T0, [keys + 16*6] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl + mov [TW + 8*7], twtemph +%endif + ; round 7 + movdqa %%T0, [keys + 16*7] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b +%endif + ; round 8 + movdqa %%T0, [keys + 16*8] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl + mov [TW + 8*9], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp +%endif + ; round 9 + movdqa %%T0, [keys + 16*9] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + 
aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +%endif + ; round 10 + movdqa %%T0, [keys + 16*10] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*10], twtempl + mov [TW + 8*11], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 +%endif + ; round 11 + movdqa %%T0, [keys + 16*11] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl +%endif + ; round 12 + movdqa %%T0, [keys + 16*12] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + mov [TW + 8*13], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph +%endif + ; round 13 + movdqa %%T0, [keys + 16*13] + aesenc %%ST1, %%T0 + aesenc %%ST2, %%T0 + aesenc %%ST3, %%T0 + aesenc %%ST4, %%T0 + aesenc %%ST5, %%T0 + aesenc %%ST6, %%T0 + aesenc %%ST7, %%T0 + aesenc %%ST8, %%T0 +%if (0 == %%last_eight) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp +; mov [TW + 8*14], twtempl +; mov [TW + 8*15], twtemph +%endif + ; round 14 + movdqa %%T0, [keys + 16*14] + aesenclast %%ST1, %%T0 + aesenclast %%ST2, %%T0 + aesenclast %%ST3, %%T0 + aesenclast %%ST4, %%T0 + aesenclast %%ST5, %%T0 + aesenclast %%ST6, %%T0 + aesenclast %%ST7, %%T0 + aesenclast %%ST8, %%T0 + + ; xor Tweak values + pxor %%ST1, %%TW1 + pxor %%ST2, %%TW2 + pxor %%ST3, %%TW3 + pxor %%ST4, %%TW4 + 
pxor %%ST5, %%TW5 + pxor %%ST6, %%TW6 + pxor %%ST7, %%TW7 + pxor %%ST8, %%TW8 + + mov [TW + 8*14], twtempl + mov [TW + 8*15], twtemph + ; load next Tweak values + movdqa %%TW1, [TW + 16*0] + movdqa %%TW2, [TW + 16*1] + movdqa %%TW3, [TW + 16*2] + movdqa %%TW4, [TW + 16*3] + movdqa %%TW5, [TW + 16*4] + movdqa %%TW6, [TW + 16*5] + movdqa %%TW7, [TW + 16*6] + +%endmacro + + +section .text + +mk_global XTS_AES_256_enc_sse, function +XTS_AES_256_enc_sse: + endbranch + + sub rsp, VARIABLE_OFFSET ; reserve the frame that holds TW, keys and the saved registers + + mov [_gpr + 8*0], rbx ; rbx is used as twtemph, save it +%ifidn __OUTPUT_FORMAT__, win64 + mov [_gpr + 8*1], rdi ; rdi is used as ghash_poly_8b on win64 + mov [_gpr + 8*2], rsi ; rsi is used as ghash_poly_8b_temp on win64 + + movdqa [_xmm + 16*0], xmm6 + movdqa [_xmm + 16*1], xmm7 + movdqa [_xmm + 16*2], xmm8 + movdqa [_xmm + 16*3], xmm9 + movdqa [_xmm + 16*4], xmm10 + movdqa [_xmm + 16*5], xmm11 + movdqa [_xmm + 16*6], xmm12 + movdqa [_xmm + 16*7], xmm13 + movdqa [_xmm + 16*8], xmm14 + movdqa [_xmm + 16*9], xmm15 +%endif + + mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b + + + movdqu xmm1, [T_val] ; read initial Tweak value + pxor xmm4, xmm4 ; zeroed temp register for key expansion + encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys + + +%ifidn __OUTPUT_FORMAT__, win64 + mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer + mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer +%endif + + + + ; both key pointers have been consumed by encrypt_T, their registers are reused as tmp1 and target_ptr_val from here on + mov target_ptr_val, N_val + and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16) + sub target_ptr_val, 128 ; adjust target_ptr_val because last 8 blocks (128 bytes, handled at _last_eight) will not be stitched with Tweak calculations + jl _less_than_128_bytes ; fewer than 8 whole blocks + + add target_ptr_val, ptr_ciphertext ; ciphertext address at which only the last 8 whole blocks remain + + + mov tmp1, N_val + and tmp1, (7 << 4) ; tmp1 = 16 * (number of whole blocks mod 8), these blocks are encrypted before the 8-block loop + jz _initial_num_blocks_is_0 + + cmp tmp1, (4 << 4) + je _initial_num_blocks_is_4 + + + + cmp tmp1, (6 << 4) + je _initial_num_blocks_is_6 + + cmp tmp1, (5 << 4) + je _initial_num_blocks_is_5 + + + + cmp tmp1, (3 << 4) + je _initial_num_blocks_is_3 + + cmp tmp1, (2 << 4) + je _initial_num_blocks_is_2 + + cmp tmp1, (1 <<
4) + je _initial_num_blocks_is_1 + +_initial_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + add ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + add ptr_ciphertext, 16*7 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + add ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + add ptr_ciphertext, 16*6 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + add ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + add ptr_ciphertext, 16*5 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, 
xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + add ptr_plaintext, 16*4 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + add ptr_ciphertext, 16*4 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + + +_initial_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + add ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + add ptr_ciphertext, 16*3 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop +_initial_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + add ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + movdqu [ptr_ciphertext+16], xmm2 + add ptr_ciphertext, 16*2 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + add ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + add ptr_ciphertext, 16 + + cmp ptr_ciphertext, target_ptr_val + je _last_eight + + jmp _main_loop + +_initial_num_blocks_is_0: + mov twtempl, 
[TW+8*0] + mov twtemph, [TW+8*1] + movdqa xmm9, [TW+16*0] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3], twtemph + movdqa xmm10, [TW+16*1] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph + movdqa xmm11, [TW+16*2] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph + movdqa xmm12, [TW+16*3] + + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph + movdqa xmm13, [TW+16*4] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph + movdqa xmm14, [TW+16*5] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph + movdqa xmm15, [TW+16*6] + + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*14], twtempl + mov [TW+8*15], twtemph + ;movdqa xmm16, [TW+16*7] + + cmp ptr_ciphertext, target_ptr_val + je _last_eight +_main_loop: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + 
movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + + add ptr_plaintext, 128 + + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + movdqu [ptr_ciphertext+16*7], xmm8 + add ptr_ciphertext, 128 + + cmp ptr_ciphertext, target_ptr_val + jne _main_loop + +_last_eight: + ; load plaintext + movdqu xmm1, [ptr_plaintext+16*0] + movdqu xmm2, [ptr_plaintext+16*1] + movdqu xmm3, [ptr_plaintext+16*2] + movdqu xmm4, [ptr_plaintext+16*3] + movdqu xmm5, [ptr_plaintext+16*4] + movdqu xmm6, [ptr_plaintext+16*5] + movdqu xmm7, [ptr_plaintext+16*6] + movdqu xmm8, [ptr_plaintext+16*7] + encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1 + + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + movdqu [ptr_ciphertext+16*6], xmm7 + + + and N_val, 15 ; N_val = N_val mod 16 + je _done +_steal_cipher: + ; start cipher stealing + + ; generate next Tweak value + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW], twtempl + mov [TW + 8], twtemph + + movdqa xmm2, xmm8 + + ; shift xmm8 to the left by 16-N_val bytes + lea twtempl, [pshufb_shf_table] + movdqu xmm0, [twtempl+N_val] + pshufb xmm8, xmm0 + + + movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a 
move + movdqu [ptr_ciphertext + 112 + N_val], xmm8 + + ; shift xmm3 to the right by 16-N_val bytes + lea twtempl, [pshufb_shf_table +16] + sub twtempl, N_val + movdqu xmm0, [twtempl] + pxor xmm0, [mask1] + pshufb xmm3, xmm0 + + pblendvb xmm3, xmm2 ;xmm0 is implicit + + ; xor Tweak value + movdqa xmm8, [TW] + pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped + + + ;encrypt last block with cipher stealing + pxor xmm8, [keys] ; ARK + aesenc xmm8, [keys + 16*1] ; round 1 + aesenc xmm8, [keys + 16*2] ; round 2 + aesenc xmm8, [keys + 16*3] ; round 3 + aesenc xmm8, [keys + 16*4] ; round 4 + aesenc xmm8, [keys + 16*5] ; round 5 + aesenc xmm8, [keys + 16*6] ; round 6 + aesenc xmm8, [keys + 16*7] ; round 7 + aesenc xmm8, [keys + 16*8] ; round 8 + aesenc xmm8, [keys + 16*9] ; round 9 + aesenc xmm8, [keys + 16*10] ; round 10 + aesenc xmm8, [keys + 16*11] ; round 11 + aesenc xmm8, [keys + 16*12] ; round 12 + aesenc xmm8, [keys + 16*13] ; round 13 + aesenclast xmm8, [keys + 16*14] ; round 14 (last round) + + ; xor Tweak value + pxor xmm8, [TW] + +_done: + ; store last ciphertext value + movdqu [ptr_ciphertext+16*7], xmm8 + +_ret_: + + mov rbx, [_gpr + 8*0] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [_gpr + 8*1] + mov rsi, [_gpr + 8*2] + + + movdqa xmm6, [_xmm + 16*0] + movdqa xmm7, [_xmm + 16*1] + movdqa xmm8, [_xmm + 16*2] + movdqa xmm9, [_xmm + 16*3] + movdqa xmm10, [_xmm + 16*4] + movdqa xmm11, [_xmm + 16*5] + movdqa xmm12, [_xmm + 16*6] + movdqa xmm13, [_xmm + 16*7] + movdqa xmm14, [_xmm + 16*8] + movdqa xmm15, [_xmm + 16*9] +%endif + + add rsp, VARIABLE_OFFSET + + ret + + + + + +_less_than_128_bytes: + cmp N_val, 16 + jb _ret_ + + mov tmp1, N_val + and tmp1, (7 << 4) + cmp tmp1, (6 << 4) + je _num_blocks_is_6 + cmp tmp1, (5 << 4) + je _num_blocks_is_5 + cmp tmp1, (4 << 4) + je _num_blocks_is_4 + cmp tmp1, (3 << 4) + je _num_blocks_is_3 + cmp tmp1, (2 << 4) + je _num_blocks_is_2 + cmp tmp1, (1 << 4) + je
_num_blocks_is_1 + +_num_blocks_is_7: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7 + sub ptr_plaintext, 16*1 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + movdqu [ptr_ciphertext+16*5], xmm6 + + sub ptr_ciphertext, 16*1 + movdqa xmm8, xmm7 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_6: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6 + sub ptr_plaintext, 16*2 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + movdqu [ptr_ciphertext+16*4], xmm5 + + sub ptr_ciphertext, 16*2 + movdqa xmm8, xmm6 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_5: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5 + sub ptr_plaintext, 16*3 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + movdqu [ptr_ciphertext+16*3], xmm4 + + sub ptr_ciphertext, 16*3 + movdqa xmm8, xmm5 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_4: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4 + sub ptr_plaintext, 16*4 
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + movdqu [ptr_ciphertext+16*2], xmm3 + + sub ptr_ciphertext, 16*4 + movdqa xmm8, xmm4 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher +_num_blocks_is_3: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3 + sub ptr_plaintext, 16*5 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1 + ; store ciphertext + movdqu [ptr_ciphertext+16*0], xmm1 + movdqu [ptr_ciphertext+16*1], xmm2 + + sub ptr_ciphertext, 16*5 + movdqa xmm8, xmm3 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +_num_blocks_is_2: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2 + sub ptr_plaintext, 16*6 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1 + ; store ciphertext + movdqu [ptr_ciphertext], xmm1 + + sub ptr_ciphertext, 16*6 + movdqa xmm8, xmm2 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + + +_num_blocks_is_1: + initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1 + + sub ptr_plaintext, 16*7 + encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1 + ; store ciphertext + + sub ptr_ciphertext, 16*7 + movdqa xmm8, xmm1 + + and N_val, 15 ; N_val = N_val mod 16 + je _done + jmp _steal_cipher + +section .data +align 16 + +pshufb_shf_table: +; use these values for shift constants for the pshufb instruction +; different alignments result in values as shown: +; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 +; 
dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2 +; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3 +; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 +; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 +; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 +; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 +; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8 +; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9 +; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10 +; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11 +; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12 +; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13 +; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14 +; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15 +dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 +dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +mask1: +dq 0x8080808080808080, 0x8080808080808080 + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm new file mode 100644 index 000000000..f75497ece --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm @@ -0,0 +1,1687 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2020 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; XTS encrypt function with 256-bit AES +; input keys are not aligned +; keys are expanded in parallel with the tweak encryption +; plaintext and ciphertext are not aligned +; second key is stored in the stack as aligned to 16 Bytes +; first key is required only once, no need for storage of this key + +%include "reg_sizes.asm" + +%if (AS_FEATURE_LEVEL) >= 10 + +default rel +%define TW rsp ; store 8 tweak values +%define keys rsp + 16*8 ; store 15 expanded keys + +%ifidn __OUTPUT_FORMAT__, win64 + %define _xmm rsp + 16*23 ; store xmm6:xmm15 +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 +%define _gpr rsp + 16*23 ; store rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%else +%define _gpr rsp + 16*33 ; store rdi, rsi, rbx +%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8 +%endif + +%define GHASH_POLY 0x87 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void XTS_AES_256_enc_avx( +; UINT8 *k2, // key used for tweaking, 16*2 bytes +; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes +; UINT8 *TW_initial, // initial tweak value, 16 bytes +; UINT64 N, // sector size, in bytes +; const UINT8 *pt, // plaintext sector input data +; UINT8 *ct); // ciphertext sector output data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; arguments for input parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %xdefine ptr_key2 rdi + %xdefine ptr_key1 rsi + %xdefine T_val rdx + %xdefine N_val rcx + %xdefine ptr_plaintext r8 + %xdefine ptr_ciphertext r9 +%else + %xdefine ptr_key2 rcx + %xdefine ptr_key1 rdx + %xdefine T_val r8 + %xdefine N_val r9 + %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5] + %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6] +%endif + +; arguments for temp parameters +%ifidn __OUTPUT_FORMAT__, elf64 + %define tmp1 rdi + %define 
ghash_poly_8b r10 + %define ghash_poly_8b_temp r11 +%else + %define tmp1 rcx + %define ghash_poly_8b rdi + %define ghash_poly_8b_temp rsi +%endif + +%define twtempl rax ; global temp registers used for tweak computation +%define twtemph rbx +%define zpoly zmm25 + +; produce the key for the next round +; raw_key is the output of vaeskeygenassist instruction +; round_key value before this key_expansion_256 macro is current round key +; round_key value after this key_expansion_256 macro is next round key +; 2 macros will be used for key generation in a flip-flopped fashion: flip broadcasts dword 3 of raw_key (vpshufd 11111111b), flop broadcasts dword 2 (vpshufd 10101010b); otherwise the two macros are identical +%macro key_expansion_256_flip 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 11111111b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + +%macro key_expansion_256_flop 3 +%define %%xraw_key %1 +%define %%xtmp %2 +%define %%xround_key %3 + vpshufd %%xraw_key, %%xraw_key, 10101010b + vshufps %%xtmp, %%xround_key, 00010000b + vpxor %%xround_key, %%xtmp + vshufps %%xtmp, %%xround_key, 10001100b + vpxor %%xround_key, %%xtmp + vpxor %%xround_key, %%xraw_key +%endmacro + + + + +; macro to encrypt the tweak value with key2 in parallel with key generation of both keys; the 15 round keys of key1 are stored at ptr_expanded_keys and the encrypted tweak is stored at [TW] + +%macro encrypt_T 10 +%define %%xkey2 %1 ; key2 round keys 0, 2, 4, ..., 14 +%define %%xkey2_2 %2 ; key2 round keys 1, 3, 5, ..., 13 +%define %%xstate_tweak %3 ; tweak block being encrypted; it is read before it is written, so it must hold the tweak on entry +%define %%xkey1 %4 ; key1 round keys 0, 2, 4, ..., 14 +%define %%xkey1_2 %5 ; key1 round keys 1, 3, 5, ..., 13 +%define %%xraw_key %6 ; receives the vaeskeygenassist output +%define %%xtmp %7 ; scratch register passed to key_expansion_256_flip/flop +%define %%ptr_key2 %8 ; pointer to key2, 2 x 16 bytes are read with unaligned loads +%define %%ptr_key1 %9 ; pointer to key1, 2 x 16 bytes are read with unaligned loads +%define %%ptr_expanded_keys %10 ; destination of the 15 key1 round keys, written with aligned stores (vmovdqa) + + + vmovdqu %%xkey2, [%%ptr_key2] + vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption + + vmovdqu %%xkey1, [%%ptr_key1] + vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 + + vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1] + vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption + + vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1] + vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2 + + + + + vaeskeygenassist %%xraw_key,
%%xkey2_2, 0x1 ; Generating round key 2 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1_2 + + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2 + key_expansion_256_flop %%xraw_key, 
%%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; 
Generating round key 12 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 + + vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2 + vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1 + key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2 + vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2 + + + vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2 + vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1 + key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1 + vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption + vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 + + vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value +%endmacro + + +; generate initial tweak values: TW1 is read from [TW]; each following tweak is the previous one doubled in twtemph:twtempl (shl/adc, with ghash_poly_8b xored into the low qword on carry-out), stored into the TW area and loaded into its tweak register +; load initial plaintext values: num_initial_blocks blocks are read from ptr_plaintext into ST1, ST2, ... with unaligned loads +%macro initialize 16 + +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7 %7 ; state 7 +%define %%ST8 %8 ; state 8 (not referenced in the body of this macro) + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 + +%define %%num_initial_blocks %16 ; number of blocks to set up; the %if chain below handles 1 to 7 + + + ; generate next Tweak values + vmovdqa %%TW1, [TW+16*0] + mov twtempl, [TW+8*0] + mov twtemph, [TW+8*1] + vmovdqu %%ST1, [ptr_plaintext+16*0] +%if (%%num_initial_blocks>=2) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*2], twtempl + mov [TW+8*3],
twtemph; + vmovdqa %%TW2, [TW+16*1] + vmovdqu %%ST2, [ptr_plaintext+16*1] +%endif +%if (%%num_initial_blocks>=3) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*4], twtempl + mov [TW+8*5], twtemph; + vmovdqa %%TW3, [TW+16*2] + vmovdqu %%ST3, [ptr_plaintext+16*2] +%endif +%if (%%num_initial_blocks>=4) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*6], twtempl + mov [TW+8*7], twtemph; + vmovdqa %%TW4, [TW+16*3] + vmovdqu %%ST4, [ptr_plaintext+16*3] +%endif +%if (%%num_initial_blocks>=5) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*8], twtempl + mov [TW+8*9], twtemph; + vmovdqa %%TW5, [TW+16*4] + vmovdqu %%ST5, [ptr_plaintext+16*4] +%endif +%if (%%num_initial_blocks>=6) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*10], twtempl + mov [TW+8*11], twtemph; + vmovdqa %%TW6, [TW+16*5] + vmovdqu %%ST6, [ptr_plaintext+16*5] +%endif +%if (%%num_initial_blocks>=7) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW+8*12], twtempl + mov [TW+8*13], twtemph; + vmovdqa %%TW7, [TW+16*6] + vmovdqu %%ST7, [ptr_plaintext+16*6] +%endif + +%endmacro + + +; encrypt initial blocks of AES: each state is xored with its tweak, run through the 14 AES rounds using the round keys at [keys], then xored with its tweak again +; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted +; next 8 Tweak values are generated and stored at [TW] (interleaved with the AES rounds), and the first 7 of them are loaded into TW1..TW7, only when the lt128 argument is 0 +%macro encrypt_initial 18 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 +%define %%ST5 %5 ; state 5 +%define %%ST6 %6 ; state 6 +%define %%ST7
%7 ; state 7 +%define %%ST8 %8 ; state 8 + +%define %%TW1 %9 ; tweak 1 +%define %%TW2 %10 ; tweak 2 +%define %%TW3 %11 ; tweak 3 +%define %%TW4 %12 ; tweak 4 +%define %%TW5 %13 ; tweak 5 +%define %%TW6 %14 ; tweak 6 +%define %%TW7 %15 ; tweak 7 +%define %%T0 %16 ; Temp register +%define %%num_blocks %17 +; %%num_blocks blocks encrypted +; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7 + +%define %%lt128 %18 ; less than 128 bytes + + ; xor Tweak value + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + + ; ARK + vmovdqa %%T0, [keys] + vpxor %%ST1, %%T0 +%if (%%num_blocks>=2) + vpxor %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%T0 +%endif + + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + %endif + + ; round 1 + vmovdqa %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*0], twtempl ; next Tweak1 generated + mov [TW + 8*1], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + %endif + + ; round 2 + vmovdqa %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if 
(%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*2], twtempl ; next Tweak2 generated + %endif + + ; round 3 + vmovdqa %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + %if (0 == %%lt128) + mov [TW + 8*3], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + %endif + + ; round 4 + vmovdqa %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*4], twtempl ; next Tweak3 generated + mov [TW + 8*5], twtemph + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + %endif + + ; round 5 + vmovdqa %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + 
+ %if (0 == %%lt128) + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*6], twtempl ; next Tweak4 generated + mov [TW + 8*7], twtemph + %endif + + ; round 6 + vmovdqa %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*8], twtempl ; next Tweak5 generated + mov [TW + 8*9], twtemph + %endif + + ; round 7 + vmovdqa %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*10], twtempl ; next Tweak6 generated + mov [TW + 8*11], twtemph + %endif + ; round 8 + vmovdqa %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, 
twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*12], twtempl ; next Tweak7 generated + mov [TW + 8*13], twtemph + %endif + ; round 9 + vmovdqa %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + %if (0 == %%lt128) + xor ghash_poly_8b_temp, ghash_poly_8b_temp + shl twtempl, 1 + adc twtemph, twtemph + cmovc ghash_poly_8b_temp, ghash_poly_8b + xor twtempl, ghash_poly_8b_temp + mov [TW + 8*14], twtempl ; next Tweak8 generated + mov [TW + 8*15], twtemph + %endif + ; round 10 + vmovdqa %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + ; round 11 + vmovdqa %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 12 + vmovdqa %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 13 + 
vmovdqa %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenc %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenc %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenc %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenc %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenc %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenc %%ST7, %%T0 +%endif + + ; round 14 + vmovdqa %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 +%if (%%num_blocks>=2) + vaesenclast %%ST2, %%T0 +%endif +%if (%%num_blocks>=3) + vaesenclast %%ST3, %%T0 +%endif +%if (%%num_blocks>=4) + vaesenclast %%ST4, %%T0 +%endif +%if (%%num_blocks>=5) + vaesenclast %%ST5, %%T0 +%endif +%if (%%num_blocks>=6) + vaesenclast %%ST6, %%T0 +%endif +%if (%%num_blocks>=7) + vaesenclast %%ST7, %%T0 +%endif + + ; xor Tweak values + vpxor %%ST1, %%TW1 +%if (%%num_blocks>=2) + vpxor %%ST2, %%TW2 +%endif +%if (%%num_blocks>=3) + vpxor %%ST3, %%TW3 +%endif +%if (%%num_blocks>=4) + vpxor %%ST4, %%TW4 +%endif +%if (%%num_blocks>=5) + vpxor %%ST5, %%TW5 +%endif +%if (%%num_blocks>=6) + vpxor %%ST6, %%TW6 +%endif +%if (%%num_blocks>=7) + vpxor %%ST7, %%TW7 +%endif + + +%if (0 == %%lt128) + ; load next Tweak values + vmovdqa %%TW1, [TW + 16*0] + vmovdqa %%TW2, [TW + 16*1] + vmovdqa %%TW3, [TW + 16*2] + vmovdqa %%TW4, [TW + 16*3] + vmovdqa %%TW5, [TW + 16*4] + vmovdqa %%TW6, [TW + 16*5] + vmovdqa %%TW7, [TW + 16*6] + +%endif + +%endmacro + + + + +; Encrypt 8 blocks in parallel: two state registers (ST1, ST2), each xored with its tweak register before and after the 14 AES rounds; round keys are broadcast from [keys] +; generate next 8 tweak values in zmm15/zmm16 when last_eight is 0 (zmm13 and zmm14 are used as scratch); zmm15/zmm16 are copied into TW1/TW2 at the end of the macro in either case +%macro encrypt_by_eight_zmm 6 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%TW1 %3 ; tweak 1 +%define %%TW2 %4 ; tweak 2 +%define %%T0 %5 ; Temp register +%define %%last_eight %6 ; 0: compute the next tweak values, nonzero: skip the tweak computation + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW1, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW1, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ;
round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW2, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW2, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 10 + vbroadcasti32x4 %%T0, [keys + 16*10] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 11 + vbroadcasti32x4 %%T0, [keys + 16*11] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 12 + vbroadcasti32x4 %%T0, [keys + 16*12] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 13 + vbroadcasti32x4 %%T0, [keys + 16*13] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + + ; round 14 + vbroadcasti32x4 %%T0, [keys + 16*14] + vaesenclast %%ST1, %%T0 + vaesenclast %%ST2, %%T0 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + + ; load next Tweak values + vmovdqa32 %%TW1, zmm15 + vmovdqa32 %%TW2, zmm16 +%endmacro + + +; Encrypt 16 blocks in parallel +; generate next 8 tweak values +%macro encrypt_by_16_zmm 10 +%define %%ST1 %1 ; state 1 +%define %%ST2 %2 ; state 2 +%define %%ST3 %3 ; state 3 +%define %%ST4 %4 ; state 4 + +%define %%TW1 %5 ; tweak 1 +%define %%TW2 %6 ; tweak 2 +%define %%TW3 %7 ; tweak 3 +%define %%TW4 %8 ; tweak 4 + +%define 
%%T0 %9 ; Temp register +%define %%last_eight %10 + + ; xor Tweak values + vpxorq %%ST1, %%TW1 + vpxorq %%ST2, %%TW2 + vpxorq %%ST3, %%TW3 + vpxorq %%ST4, %%TW4 + + ; ARK + vbroadcasti32x4 %%T0, [keys] + vpxorq %%ST1, %%T0 + vpxorq %%ST2, %%T0 + vpxorq %%ST3, %%T0 + vpxorq %%ST4, %%T0 + +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW3, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm15, %%TW3, 1 + vpxord zmm15, zmm15, zmm14 +%endif + ; round 1 + vbroadcasti32x4 %%T0, [keys + 16*1] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 2 + vbroadcasti32x4 %%T0, [keys + 16*2] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 3 + vbroadcasti32x4 %%T0, [keys + 16*3] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, %%TW4, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm16, %%TW4, 1 + vpxord zmm16, zmm16, zmm14 +%endif + ; round 4 + vbroadcasti32x4 %%T0, [keys + 16*4] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 5 + vbroadcasti32x4 %%T0, [keys + 16*5] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 6 + vbroadcasti32x4 %%T0, [keys + 16*6] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if (0 == %%last_eight) + vpsrldq zmm13, zmm15, 15 + vpclmulqdq zmm14, zmm13, zpoly, 0 + vpslldq zmm17, zmm15, 1 + vpxord zmm17, zmm17, zmm14 +%endif + ; round 7 + vbroadcasti32x4 %%T0, [keys + 16*7] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 8 + vbroadcasti32x4 %%T0, [keys + 16*8] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 + + ; round 9 + vbroadcasti32x4 %%T0, [keys + 16*9] + vaesenc %%ST1, %%T0 + vaesenc %%ST2, %%T0 + vaesenc %%ST3, %%T0 + vaesenc %%ST4, %%T0 +%if 
(0 == %%last_eight)
        vpsrldq         zmm13, zmm16, 15
        vpclmulqdq      zmm14, zmm13, zpoly, 0
        vpslldq         zmm18, zmm16, 1
        vpxord          zmm18, zmm18, zmm14
%endif
        ; round 10
        vbroadcasti32x4 %%T0, [keys + 16*10]
        vaesenc         %%ST1, %%T0
        vaesenc         %%ST2, %%T0
        vaesenc         %%ST3, %%T0
        vaesenc         %%ST4, %%T0

        ; round 11
        vbroadcasti32x4 %%T0, [keys + 16*11]
        vaesenc         %%ST1, %%T0
        vaesenc         %%ST2, %%T0
        vaesenc         %%ST3, %%T0
        vaesenc         %%ST4, %%T0

        ; round 12
        vbroadcasti32x4 %%T0, [keys + 16*12]
        vaesenc         %%ST1, %%T0
        vaesenc         %%ST2, %%T0
        vaesenc         %%ST3, %%T0
        vaesenc         %%ST4, %%T0

        ; round 13
        vbroadcasti32x4 %%T0, [keys + 16*13]
        vaesenc         %%ST1, %%T0
        vaesenc         %%ST2, %%T0
        vaesenc         %%ST3, %%T0
        vaesenc         %%ST4, %%T0

        ; round 14
        vbroadcasti32x4 %%T0, [keys + 16*14]
        vaesenclast     %%ST1, %%T0
        vaesenclast     %%ST2, %%T0
        vaesenclast     %%ST3, %%T0
        vaesenclast     %%ST4, %%T0

        ; xor Tweak values
        vpxorq          %%ST1, %%TW1
        vpxorq          %%ST2, %%TW2
        vpxorq          %%ST3, %%TW3
        vpxorq          %%ST4, %%TW4

        ; load next Tweak values
        vmovdqa32       %%TW1, zmm15
        vmovdqa32       %%TW2, zmm16
        vmovdqa32       %%TW3, zmm17
        vmovdqa32       %%TW4, zmm18
%endmacro


section .text

;-----------------------------------------------------------------------
; XTS_AES_256_enc_vaes
;
; AES-256 XTS encryption using VAES / AVX-512 instructions.
;
; Operands are accessed through symbolic names defined earlier in this
; file:
;   ptr_key2, ptr_key1 : key pointers handed to the encrypt_T macro
;   T_val              : address of the 16-byte initial tweak
;   N_val              : number of bytes to process (unsigned)
;   ptr_plaintext      : source buffer
;   ptr_ciphertext     : destination buffer
; On win64 ptr_plaintext / ptr_ciphertext are the 5th and 6th arguments
; and are fetched from the caller's stack frame.
;
; Control flow:
;   N_val <  16        : return without writing any output
;   N_val <  128       : _less_than_128_bytes, 1..7 blocks with xmm code
;   N_val >= 256       : _main_loop_run_16, 16 blocks per iteration
;   N_val >= 128       : _main_loop_run_8,   8 blocks per iteration
;   remaining blocks   : _do_n_blocks
;   N_val % 16 != 0    : _steal_cipher (ciphertext stealing)
;
; Invariant at _do_n_blocks (always reached from one of the main loops):
;   xmm0         = last ciphertext block produced so far
;   zmm9, zmm10  = tweaks for the next eight blocks
;   zmm15, zmm16 = same values as zmm9, zmm10, so encrypt_by_eight_zmm
;                  invoked with last_eight=1 leaves zmm9/zmm10 unchanged
;-----------------------------------------------------------------------
mk_global XTS_AES_256_enc_vaes, function
XTS_AES_256_enc_vaes:
        endbranch

%define ALIGN_STACK
%ifdef ALIGN_STACK
        push            rbp
        mov             rbp, rsp                ; rbp = entry rsp - 8, restored in epilogue
        sub             rsp, VARIABLE_OFFSET
        and             rsp, ~63                ; 64-byte align the local area
%else
        sub             rsp, VARIABLE_OFFSET
%endif

        mov             [_gpr + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
        mov             [_gpr + 8*1], rdi
        mov             [_gpr + 8*2], rsi

        vmovdqa         [_xmm + 16*0], xmm6
        vmovdqa         [_xmm + 16*1], xmm7
        vmovdqa         [_xmm + 16*2], xmm8
        vmovdqa         [_xmm + 16*3], xmm9
        vmovdqa         [_xmm + 16*4], xmm10
        vmovdqa         [_xmm + 16*5], xmm11
        vmovdqa         [_xmm + 16*6], xmm12
        vmovdqa         [_xmm + 16*7], xmm13
        vmovdqa         [_xmm + 16*8], xmm14
        vmovdqa         [_xmm + 16*9], xmm15
%endif

        mov             ghash_poly_8b, GHASH_POLY       ; load 0x87 to ghash_poly_8b


        vmovdqu         xmm1, [T_val]                   ; read initial Tweak value
        vpxor           xmm4, xmm4                      ; for key expansion
        encrypt_T       xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys


%ifidn __OUTPUT_FORMAT__, win64
%ifdef ALIGN_STACK
        ; rsp was re-aligned with "and rsp, ~63", so the caller's stack
        ; arguments are not at a fixed offset from rsp any more.  rbp still
        ; points at the saved rbp: [rbp+8] is the return address, followed by
        ; the 32-byte shadow space and then the 5th and 6th arguments.
        mov             ptr_plaintext, [rbp + 8 + 8*5]  ; plaintext pointer
        mov             ptr_ciphertext, [rbp + 8 + 8*6] ; ciphertext pointer
%else
        mov             ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5]    ; plaintext pointer
        mov             ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6]   ; ciphertext pointer
%endif
%endif

        ; N_val is a byte count: use unsigned condition codes throughout
        cmp             N_val, 128
        jb              _less_than_128_bytes

        vpbroadcastq    zpoly, ghash_poly_8b

        cmp             N_val, 256
        jae             _start_by16

        cmp             N_val, 128              ; always true here (N_val >= 128 was checked above)
        jae             _start_by8

_do_n_blocks:
        test            N_val, N_val
        jz              _ret_

        cmp             N_val, (7*16)
        jae             _remaining_num_blocks_is_7

        cmp             N_val, (6*16)
        jae             _remaining_num_blocks_is_6

        cmp             N_val, (5*16)
        jae             _remaining_num_blocks_is_5

        cmp             N_val, (4*16)
        jae             _remaining_num_blocks_is_4

        cmp             N_val, (3*16)
        jae             _remaining_num_blocks_is_3

        cmp             N_val, (2*16)
        jae             _remaining_num_blocks_is_2

        cmp             N_val, (1*16)
        jae             _remaining_num_blocks_is_1

;; _remaining_num_blocks_is_0:
        ; only a partial block is left: steal from the last block of the main loop
        vmovdqa         xmm8, xmm0              ; xmm8 = last ciphertext block
        vmovdqa         xmm0, xmm9              ; xmm0 = next tweak
        jmp             _steal_cipher

_remaining_num_blocks_is_7:
        mov             tmp1, -1
        shr             tmp1, 16
        kmovq           k1, tmp1                ; k1 = low 48 bits set: byte mask for 3 blocks
        vmovdqu8        zmm1, [ptr_plaintext+16*0]
        vmovdqu8        zmm2 {k1}, [ptr_plaintext+16*4]
        add             ptr_plaintext, 16*7
        encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8        [ptr_ciphertext+16*0], zmm1
        vmovdqu8        [ptr_ciphertext+16*4] {k1}, zmm2
        add             ptr_ciphertext, 16*7

        vextracti32x4   xmm8, zmm2, 0x2         ; last ciphertext block
        vextracti32x4   xmm0, zmm10, 0x3        ; next tweak
        and             N_val, 15
        je              _ret_
        jmp             _steal_cipher

_remaining_num_blocks_is_6:
        vmovdqu8        zmm1, [ptr_plaintext+16*0]
        vmovdqu8        ymm2, [ptr_plaintext+16*4]
        add             ptr_plaintext, 16*6
        encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8        [ptr_ciphertext+16*0], zmm1
        vmovdqu8        [ptr_ciphertext+16*4], ymm2
        add             ptr_ciphertext, 16*6

        vextracti32x4   xmm8, zmm2, 0x1         ; last ciphertext block
        vextracti32x4   xmm0, zmm10, 0x2        ; next tweak
        and             N_val, 15
        je              _ret_
        jmp             _steal_cipher

_remaining_num_blocks_is_5:
        vmovdqu8        zmm1, [ptr_plaintext+16*0]
        vmovdqu         xmm2, [ptr_plaintext+16*4]
        add             ptr_plaintext, 16*5
        encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8        [ptr_ciphertext+16*0], zmm1
        vmovdqu         [ptr_ciphertext+16*4], xmm2
        add             ptr_ciphertext, 16*5

        vmovdqa         xmm8, xmm2              ; last ciphertext block (VEX form, no legacy-SSE mix)
        vextracti32x4   xmm0, zmm10, 0x1        ; next tweak
        and             N_val, 15
        je              _ret_
        jmp             _steal_cipher

_remaining_num_blocks_is_4:
        vmovdqu8        zmm1, [ptr_plaintext+16*0]
        add             ptr_plaintext, 16*4
        encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
        vmovdqu8        [ptr_ciphertext+16*0], zmm1
        add             ptr_ciphertext, 16*4

        vextracti32x4   xmm8, zmm1, 0x3         ; last ciphertext block
        vextracti32x4   xmm0, zmm10, 0x0        ; next tweak
        and             N_val, 15
        je              _ret_
        jmp             _steal_cipher

_remaining_num_blocks_is_3:
        vextracti32x4   xmm10, zmm9, 1
        vextracti32x4   xmm11, zmm9, 2
        vmovdqu         xmm1, [ptr_plaintext+16*0]
        vmovdqu         xmm2, [ptr_plaintext+16*1]
        vmovdqu         xmm3, [ptr_plaintext+16*2]
        add             ptr_plaintext, 16*3
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        vmovdqu         [ptr_ciphertext+16*0], xmm1
        vmovdqu         [ptr_ciphertext+16*1], xmm2
        vmovdqu         [ptr_ciphertext+16*2], xmm3
        add             ptr_ciphertext, 16*3

        vmovdqa         xmm8, xmm3              ; last ciphertext block
        vextracti32x4   xmm0, zmm9, 3           ; next tweak
        and             N_val, 15
        je              _ret_
        jmp             _steal_cipher

_remaining_num_blocks_is_2:
        vextracti32x4   xmm10, zmm9, 1
        vmovdqu         xmm1, [ptr_plaintext+16*0]
        vmovdqu         xmm2, [ptr_plaintext+16*1]
        add             ptr_plaintext, 16*2
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        vmovdqu         [ptr_ciphertext+16*0], xmm1
        vmovdqu         [ptr_ciphertext+16*1], xmm2
        add             ptr_ciphertext, 16*2

        vmovdqa         xmm8, xmm2              ; last ciphertext block
        vextracti32x4   xmm0, zmm9, 2           ; next tweak
        and             N_val, 15
        je              _ret_
        jmp             _steal_cipher

_remaining_num_blocks_is_1:
        vmovdqu         xmm1, [ptr_plaintext]
        add             ptr_plaintext, 16
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        vmovdqu         [ptr_ciphertext], xmm1
        add             ptr_ciphertext, 16

        vmovdqa         xmm8, xmm1              ; last ciphertext block
        vextracti32x4   xmm0, zmm9, 1           ; next tweak
        and             N_val, 15
        je              _ret_
        jmp             _steal_cipher


_start_by16:
        ; Make first 7 tweak values
        vbroadcasti32x4 zmm0, [TW]
        vbroadcasti32x4 zmm8, [shufb_15_7]
        mov             tmp1, 0xaa
        kmovq           k2, tmp1

        ; Mult tweak by 2^{3, 2, 1, 0}
        vpshufb         zmm1, zmm0, zmm8                ; mov 15->0, 7->8
        vpsllvq         zmm4, zmm0, [const_dq3210]      ; shift l 3,2,1,0
        vpsrlvq         zmm2, zmm1, [const_dq5678]      ; shift r 5,6,7,8
        vpclmulqdq      zmm3, zmm2, zpoly, 0x00
        vpxorq          zmm4 {k2}, zmm4, zmm2           ; tweaks shifted by 3-0
        vpxord          zmm9, zmm3, zmm4

        ; Mult tweak by 2^{7, 6, 5, 4}
        vpsllvq         zmm5, zmm0, [const_dq7654]      ; shift l 7,6,5,4
        vpsrlvq         zmm6, zmm1, [const_dq1234]      ; shift r 1,2,3,4
        vpclmulqdq      zmm7, zmm6, zpoly, 0x00
        vpxorq          zmm5 {k2}, zmm5, zmm6           ; tweaks shifted by 7-4
        vpxord          zmm10, zmm7, zmm5

        ; Make next 8 tweak values by all x 2^8
        vpsrldq         zmm13, zmm9, 15
        vpclmulqdq      zmm14, zmm13, zpoly, 0
        vpslldq         zmm11, zmm9, 1
        vpxord          zmm11, zmm11, zmm14

        vpsrldq         zmm15, zmm10, 15
        vpclmulqdq      zmm16, zmm15, zpoly, 0
        vpslldq         zmm12, zmm10, 1
        vpxord          zmm12, zmm12, zmm16

_main_loop_run_16:
        vmovdqu8        zmm1, [ptr_plaintext+16*0]
        vmovdqu8        zmm2, [ptr_plaintext+16*4]
        vmovdqu8        zmm3, [ptr_plaintext+16*8]
        vmovdqu8        zmm4, [ptr_plaintext+16*12]
        add             ptr_plaintext, 256

        encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0

        vmovdqu8        [ptr_ciphertext+16*0], zmm1
        vmovdqu8        [ptr_ciphertext+16*4], zmm2
        vmovdqu8        [ptr_ciphertext+16*8], zmm3
        vmovdqu8        [ptr_ciphertext+16*12], zmm4
        add             ptr_ciphertext, 256
        sub             N_val, 256

        cmp             N_val, 256
        jae             _main_loop_run_16

        cmp             N_val, 128
        jae             _main_loop_run_8

        vextracti32x4   xmm0, zmm4, 0x3         ; keep last crypted block
        jmp             _do_n_blocks

_start_by8:
        ; Make first 7 tweak values
        vbroadcasti32x4 zmm0, [TW]
        vbroadcasti32x4 zmm8, [shufb_15_7]
        mov             tmp1, 0xaa
        kmovq           k2, tmp1

        ; Mult tweak by 2^{3, 2, 1, 0}
        vpshufb         zmm1, zmm0, zmm8                ; mov 15->0, 7->8
        vpsllvq         zmm4, zmm0, [const_dq3210]      ; shift l 3,2,1,0
        vpsrlvq         zmm2, zmm1, [const_dq5678]      ; shift r 5,6,7,8
        vpclmulqdq      zmm3, zmm2, zpoly, 0x00
        vpxorq          zmm4 {k2}, zmm4, zmm2           ; tweaks shifted by 3-0
        vpxord          zmm9, zmm3, zmm4

        ; Mult tweak by 2^{7, 6, 5, 4}
        vpsllvq         zmm5, zmm0, [const_dq7654]      ; shift l 7,6,5,4
        vpsrlvq         zmm6, zmm1, [const_dq1234]      ; shift r 1,2,3,4
        vpclmulqdq      zmm7, zmm6, zpoly, 0x00
        vpxorq          zmm5 {k2}, zmm5, zmm6           ; tweaks shifted by 7-4
        vpxord          zmm10, zmm7, zmm5

_main_loop_run_8:
        ; load plaintext
        vmovdqu8        zmm1, [ptr_plaintext+16*0]
        vmovdqu8        zmm2, [ptr_plaintext+16*4]
        add             ptr_plaintext, 128

        encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0

        ; store ciphertext
        vmovdqu8        [ptr_ciphertext+16*0], zmm1
        vmovdqu8        [ptr_ciphertext+16*4], zmm2
        add             ptr_ciphertext, 128
        sub             N_val, 128

        cmp             N_val, 128
        jae             _main_loop_run_8

        vextracti32x4   xmm0, zmm2, 0x3         ; keep last crypted block
        jmp             _do_n_blocks

_steal_cipher_next:
        ; generate next Tweak value
        xor             ghash_poly_8b_temp, ghash_poly_8b_temp
        shl             twtempl, 1
        adc             twtemph, twtemph
        cmovc           ghash_poly_8b_temp, ghash_poly_8b
        xor             twtempl, ghash_poly_8b_temp
        mov             [TW], twtempl
        mov             [TW + 8], twtemph
        vmovdqa         xmm0, [TW]

_steal_cipher:
        ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
        vmovdqa         xmm2, xmm8

        ; shift xmm8 to the left by 16-N_val bytes
        lea             twtempl, [vpshufb_shf_table]
        vmovdqu         xmm10, [twtempl+N_val]
        vpshufb         xmm8, xmm10

        vmovdqu         xmm3, [ptr_plaintext - 16 + N_val]
        vmovdqu         [ptr_ciphertext - 16 + N_val], xmm8

        ; shift xmm3 to the right by 16-N_val bytes
        lea             twtempl, [vpshufb_shf_table +16]
        sub             twtempl, N_val
        vmovdqu         xmm10, [twtempl]
        vpxor           xmm10, [mask1]
        vpshufb         xmm3, xmm10

        vpblendvb       xmm3, xmm3, xmm2, xmm10

        ; xor Tweak value
        vpxor           xmm8, xmm3, xmm0

        ;encrypt last block with cipher stealing
        vpxor           xmm8, [keys]                    ; ARK
        vaesenc         xmm8, [keys + 16*1]             ; round 1
        vaesenc         xmm8, [keys + 16*2]             ; round 2
        vaesenc         xmm8, [keys + 16*3]             ; round 3
        vaesenc         xmm8, [keys + 16*4]             ; round 4
        vaesenc         xmm8, [keys + 16*5]             ; round 5
        vaesenc         xmm8, [keys + 16*6]             ; round 6
        vaesenc         xmm8, [keys + 16*7]             ; round 7
        vaesenc         xmm8, [keys + 16*8]             ; round 8
        vaesenc         xmm8, [keys + 16*9]             ; round 9
        vaesenc         xmm8, [keys + 16*10]            ; round 10
        vaesenc         xmm8, [keys + 16*11]            ; round 11
        vaesenc         xmm8, [keys + 16*12]            ; round 12
        vaesenc         xmm8, [keys + 16*13]            ; round 13
        vaesenclast     xmm8, [keys + 16*14]            ; round 14

        ; xor Tweak value
        vpxor           xmm8, xmm8, xmm0

        ; store last ciphertext value
        vmovdqu         [ptr_ciphertext - 16], xmm8

_ret_:
        ; zmm/ymm registers were used above: clear the upper halves so the
        ; caller's SSE code does not pay an AVX->SSE transition penalty
        vzeroupper

        mov             rbx, [_gpr + 8*0]

%ifidn __OUTPUT_FORMAT__, win64
        mov             rdi, [_gpr + 8*1]
        mov             rsi, [_gpr + 8*2]

        vmovdqa         xmm6, [_xmm + 16*0]
        vmovdqa         xmm7, [_xmm + 16*1]
        vmovdqa         xmm8, [_xmm + 16*2]
        vmovdqa         xmm9, [_xmm + 16*3]
        vmovdqa         xmm10, [_xmm + 16*4]
        vmovdqa         xmm11, [_xmm + 16*5]
        vmovdqa         xmm12, [_xmm + 16*6]
        vmovdqa         xmm13, [_xmm + 16*7]
        vmovdqa         xmm14, [_xmm + 16*8]
        vmovdqa         xmm15, [_xmm + 16*9]
%endif

%ifndef ALIGN_STACK
        add             rsp, VARIABLE_OFFSET
%else
        mov             rsp, rbp
        pop             rbp
%endif
        ret


_less_than_128_bytes:
        cmp             N_val, 16
        jb              _ret_                   ; less than one block: nothing to do

        ; dispatch on the number of whole blocks (bits 4..6 of N_val)
        mov             tmp1, N_val
        and             tmp1, (7 << 4)
        cmp             tmp1, (6 << 4)
        je              _num_blocks_is_6
        cmp             tmp1, (5 << 4)
        je              _num_blocks_is_5
        cmp             tmp1, (4 << 4)
        je              _num_blocks_is_4
        cmp             tmp1, (3 << 4)
        je              _num_blocks_is_3
        cmp             tmp1, (2 << 4)
        je              _num_blocks_is_2
        cmp             tmp1, (1 << 4)
        je              _num_blocks_is_1

_num_blocks_is_7:
        initialize      xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
        add             ptr_plaintext, 16*7
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
        ; store ciphertext
        vmovdqu         [ptr_ciphertext+16*0], xmm1
        vmovdqu         [ptr_ciphertext+16*1], xmm2
        vmovdqu         [ptr_ciphertext+16*2], xmm3
        vmovdqu         [ptr_ciphertext+16*3], xmm4
        vmovdqu         [ptr_ciphertext+16*4], xmm5
        vmovdqu         [ptr_ciphertext+16*5], xmm6
        vmovdqu         [ptr_ciphertext+16*6], xmm7
        add             ptr_ciphertext, 16*7
        vmovdqa         xmm8, xmm7

        and             N_val, 15               ; N_val = N_val mod 16
        je              _ret_
        jmp             _steal_cipher_next
_num_blocks_is_6:
        initialize      xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
        add             ptr_plaintext, 16*6
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
        ; store ciphertext
        vmovdqu         [ptr_ciphertext+16*0], xmm1
        vmovdqu         [ptr_ciphertext+16*1], xmm2
        vmovdqu         [ptr_ciphertext+16*2], xmm3
        vmovdqu         [ptr_ciphertext+16*3], xmm4
        vmovdqu         [ptr_ciphertext+16*4], xmm5
        vmovdqu         [ptr_ciphertext+16*5], xmm6

        add             ptr_ciphertext, 16*6
        vmovdqa         xmm8, xmm6

        and             N_val, 15               ; N_val = N_val mod 16
        je              _ret_
        jmp             _steal_cipher_next
_num_blocks_is_5:
        initialize      xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
        add             ptr_plaintext, 16*5
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
        ; store ciphertext
        vmovdqu         [ptr_ciphertext+16*0], xmm1
        vmovdqu         [ptr_ciphertext+16*1], xmm2
        vmovdqu         [ptr_ciphertext+16*2], xmm3
        vmovdqu         [ptr_ciphertext+16*3], xmm4
        vmovdqu         [ptr_ciphertext+16*4], xmm5

        add             ptr_ciphertext, 16*5
        vmovdqa         xmm8, xmm5

        and             N_val, 15               ; N_val = N_val mod 16
        je              _ret_
        jmp             _steal_cipher_next
_num_blocks_is_4:
        initialize      xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
        add             ptr_plaintext, 16*4
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
        ; store ciphertext
        vmovdqu         [ptr_ciphertext+16*0], xmm1
        vmovdqu         [ptr_ciphertext+16*1], xmm2
        vmovdqu         [ptr_ciphertext+16*2], xmm3
        vmovdqu         [ptr_ciphertext+16*3], xmm4

        add             ptr_ciphertext, 16*4
        vmovdqa         xmm8, xmm4

        and             N_val, 15               ; N_val = N_val mod 16
        je              _ret_
        jmp             _steal_cipher_next
_num_blocks_is_3:
        initialize      xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
        add             ptr_plaintext, 16*3
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
        ; store ciphertext
        vmovdqu         [ptr_ciphertext+16*0], xmm1
        vmovdqu         [ptr_ciphertext+16*1], xmm2
        vmovdqu         [ptr_ciphertext+16*2], xmm3

        add             ptr_ciphertext, 16*3
        vmovdqa         xmm8, xmm3

        and             N_val, 15               ; N_val = N_val mod 16
        je              _ret_
        jmp             _steal_cipher_next

_num_blocks_is_2:
        initialize      xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
        add             ptr_plaintext, 16*2
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
        ; store ciphertext
        vmovdqu         [ptr_ciphertext], xmm1
        vmovdqu         [ptr_ciphertext+16], xmm2

        add             ptr_ciphertext, 16*2
        vmovdqa         xmm8, xmm2

        and             N_val, 15               ; N_val = N_val mod 16
        je              _ret_
        jmp             _steal_cipher_next

_num_blocks_is_1:
        initialize      xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
        add             ptr_plaintext, 16*1
        encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
        vmovdqu         [ptr_ciphertext], xmm1
        add             ptr_ciphertext, 16*1
        vmovdqa         xmm8, xmm1
        and             N_val, 15               ; N_val = N_val mod 16
        je              _ret_
        jmp             _steal_cipher_next

section .data
align 16

vpshufb_shf_table:
; use these values for shift constants for the vpshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

mask1:
dq 0x8080808080808080, 0x8080808080808080

const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1

shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_XTS_AES_256_enc_vaes
no_XTS_AES_256_enc_vaes:
%endif
%endif  ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S new file mode 100644 index 000000000..7214f0f25 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S @@ -0,0 +1,215 @@
/**********************************************************************
  Copyright(c) 2021 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/

#include "gcm_common_128.S"
/*
	void gist_aes_gcm_enc_finalize_##mode(	\
		const struct gcm_key_data *key_data,	\
		struct gcm_context_data *context,	\
		uint8_t *auth_tag,	\
		uint64_t auth_tag_len	\
	)
*/
	/*
	 * Register assignment.  Several names share one register on purpose:
	 *   context / partial_block  -> register 1 (context is post-incremented
	 *                               to the partial block by the ldp below)
	 *   key_data / hashkey_addr  -> register 0
	 */
	declare_var_generic_reg	key_data		,0
	declare_var_generic_reg	context			,1
	declare_var_generic_reg	auth_tag		,2
	declare_var_generic_reg	auth_tag_len		,3
	declare_var_generic_reg	partial_block_len	,4
	declare_var_generic_reg	partial_block		,1

	declare_var_generic_reg	hashkey_addr		,0
	declare_var_generic_reg	temp0, 6

	declare_var_vector_reg	OrigIV		,0
	declare_var_vector_reg	AadHash		,1
	declare_var_vector_reg	HashKey0	,2
	declare_var_vector_reg	HashKey0Ext	,3
	declare_var_vector_reg	High		,4
	declare_var_vector_reg	Low		,5
	declare_var_vector_reg	Middle0		,6
	declare_var_vector_reg	Len		,7
	declare_var_vector_reg	Tmp0		,8
	declare_var_vector_reg	Tmp1		,9
	declare_var_vector_reg	Zero		,10
	declare_var_vector_reg	Poly		,11
	declare_var_vector_reg	PartitialBlock	,13

	declare_var_vector_reg	Tmp2		,31
	declare_var_vector_reg	Tmp3		,12

	/* d8-d13 are saved/restored because vector registers 8-13 are used above */
	.set		stack_size,48
	.macro		push_stack
	stp		d8, d9,[sp,-stack_size]!
	stp		d10,d11,[sp,16]
	stp		d12,d13,[sp,32]
	.endm

	.macro		pop_stack
	ldp		d10,d11,[sp,16]
	ldp		d12,d13,[sp,32]
	ldp		d8, d9, [sp], stack_size
	.endm

/*
 * Finalize: fold the optional partial block and the length block into the
 * running hash, encrypt OrigIV, combine both and store 16, 12 or 8 bytes
 * of the result to auth_tag.  The AES rounds are interleaved with the
 * pmull-based hash multiplications.
 * enc and dec share the same body.
 */
START_FUNC(enc,KEY_LEN,_finalize_)
START_FUNC(dec,KEY_LEN,_finalize_)
	ldr		partial_block_len,[context,PARTIAL_BLOCK_LENGTH_OFF]
	load_aes_keys	key_data
	push_stack

	ldr		qOrigIV,[context,ORIG_IV_OFF]	/* OrigIV */
	ldp		qAadHash,qLen,[context],PARTIAL_BLOCK_ENC_KEY_OFF	/* Len , context move to partial block*/
	/* Init Consts for ghash (set up once; nothing above consumes them) */
	movi		vZero.4s,0
	mov		temp0,0x87
	dup		vPoly.2d,temp0
	/* complete part */
	cbnz		partial_block_len,10f
	ldp		qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
	aes_encrypt_round	OrigIV,Key0
	pmull2		vHigh.1q,vAadHash.2d,vHashKey0.2d
	aes_encrypt_round	OrigIV,Key1
	pmull		vLow.1q ,vAadHash.1d,vHashKey0.1d
	shl		vLen.2d,vLen.2d,3	/* Len */
	aes_encrypt_round	OrigIV,Key2
	pmull		vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
	rev64		vLen.16b,vLen.16b	/* Len */
	aes_encrypt_round	OrigIV,Key3
	pmull2		vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
	rbit		vAadHash.16b,vLen.16b	/* Len */
	ldp		qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
	aes_encrypt_round	OrigIV,Key4
	eor		vMiddle0.16b,vMiddle0.16b,vTmp0.16b
	aes_encrypt_round	OrigIV,Key5
	pmull2		vTmp0.1q ,vAadHash.2d,vHashKey0.2d
	aes_encrypt_round	OrigIV,Key6
	pmull		vTmp1.1q ,vAadHash.1d,vHashKey0.1d
	aes_encrypt_round	OrigIV,Key7
	eor		vHigh.16b,vHigh.16b,vTmp0.16b
	eor		vLow.16b ,vLow.16b ,vTmp1.16b
	pmull2		vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
	aes_encrypt_round	OrigIV,Key8
	pmull		vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
	aese		vOrigIV.16b,vKey9.16b
	eor		vMiddle0.16b,vMiddle0.16b,vTmp2.16b
	eor		vOrigIV.16b,vOrigIV.16b,vKey10.16b
	rbit		vAadHash.16b,vOrigIV.16b
	eor		vMiddle0.16b,vMiddle0.16b,vTmp3.16b
	ghash_mult_final_round	AadHash,High,Low,Middle0,Tmp0,Zero,Poly

	rbit		vAadHash.16b,vAadHash.16b	/* Aad */
	/* output auth_tag */
	cmp		auth_tag_len,16
	bne		1f
	/* most likely auth_tag_len=16 */
	str		qAadHash,[auth_tag]
	pop_stack
	ret
1:	/* auth_tag_len=12 */
	cmp		auth_tag_len,12
	bne		1f
	str		dAadHash,[auth_tag],8
	st1		{vAadHash.s}[2],[auth_tag]
	pop_stack
	ret
1:	/* auth_tag_len=8 (any length other than 16 or 12 ends up here) */
	str		dAadHash,[auth_tag]
	pop_stack
	ret

10:	/* cbnz partial_block_len,10f */
	ldp		qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-3)*32]
	aes_encrypt_round	OrigIV,Key0
	read_small_data_start	PartitialBlock,partial_block,partial_block_len,temp0,Tmp0
	pmull2		vHigh.1q,vAadHash.2d,vHashKey0.2d
	aes_encrypt_round	OrigIV,Key1
	pmull		vLow.1q ,vAadHash.1d,vHashKey0.1d
	aes_encrypt_round	OrigIV,Key2
	pmull		vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
	aes_encrypt_round	OrigIV,Key3
	pmull2		vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
	aes_encrypt_round	OrigIV,Key4
	rbit		vAadHash.16b,vPartitialBlock.16b
	ldp		qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
	aes_encrypt_round	OrigIV,Key5
	eor		vMiddle0.16b,vMiddle0.16b,vTmp0.16b
	pmull2		vTmp0.1q,vAadHash.2d,vHashKey0.2d
	aes_encrypt_round	OrigIV,Key6
	shl		vLen.2d,vLen.2d,3	/* Len */
	pmull		vTmp1.1q ,vAadHash.1d,vHashKey0.1d
	eor		vHigh.16b,vHigh.16b,vTmp0.16b
	aes_encrypt_round	OrigIV,Key7
	eor		vLow.16b,vLow.16b,vTmp1.16b
	pmull2		vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
	rev64		vLen.16b,vLen.16b	/* Len */
	aes_encrypt_round	OrigIV,Key8
	eor		vMiddle0.16b,vMiddle0.16b,vTmp0.16b
	aese		vOrigIV.16b,vKey9.16b
	pmull		vTmp0.1q,vAadHash.1d,vHashKey0Ext.1d
	rbit		vAadHash.16b,vLen.16b	/* Len */
	ldp		qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
	eor		vMiddle0.16b,vMiddle0.16b,vTmp0.16b
	eor		vOrigIV.16b,vOrigIV.16b,vKey10.16b
	pmull2		vTmp0.1q ,vAadHash.2d,vHashKey0.2d
	pmull		vTmp1.1q ,vAadHash.1d,vHashKey0.1d
	eor		vHigh.16b,vHigh.16b,vTmp0.16b
	eor		vLow.16b ,vLow.16b ,vTmp1.16b
	pmull2		vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
	pmull		vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
	eor		vMiddle0.16b,vMiddle0.16b,vTmp2.16b
	eor		vMiddle0.16b,vMiddle0.16b,vTmp3.16b
	rbit		vAadHash.16b,vOrigIV.16b
	ghash_mult_final_round	AadHash,High,Low,Middle0,Tmp0,Zero,Poly

	rbit		vAadHash.16b,vAadHash.16b	/* Aad */
	/* output auth_tag */
	cmp		auth_tag_len,16
	bne		1f
	/* most likely auth_tag_len=16 */
	str		qAadHash,[auth_tag]
	pop_stack
	ret
1:	/* auth_tag_len=12 */
	cmp		auth_tag_len,12
	bne		1f
	str		dAadHash,[auth_tag],8
	st1		{vAadHash.s}[2],[auth_tag]
	pop_stack
	ret
1:	/* auth_tag_len=8 (any length other than 16 or 12 ends up here) */
	str		dAadHash,[auth_tag]
	pop_stack
	ret

END_FUNC(enc,KEY_LEN,_finalize_)
END_FUNC(dec,KEY_LEN,_finalize_)



diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S new file mode 100644 index 000000000..9eda7178e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S @@ -0,0 +1,220 @@
/**********************************************************************
  Copyright(c) 2021 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "gcm_common_256.S" +/* + void gist_aes_gcm_enc_finalize_##mode( \ + const struct gcm_key_data *key_data, \ + struct gcm_context_data *context, \ + uint8_t *auth_tag, \ + uint64_t auth_tag_len \ + ) +*/ + declare_var_generic_reg key_data ,0 + declare_var_generic_reg context ,1 + declare_var_generic_reg auth_tag ,2 + declare_var_generic_reg auth_tag_len ,3 + declare_var_generic_reg partial_block_len ,4 + declare_var_generic_reg partial_block ,1 + + declare_var_generic_reg hashkey_addr ,0 + declare_var_generic_reg temp0 ,6 + + declare_var_vector_reg OrigIV ,0 + declare_var_vector_reg AadHash ,1 + declare_var_vector_reg HashKey0 ,2 + declare_var_vector_reg HashKey0Ext ,3 + declare_var_vector_reg High ,4 + declare_var_vector_reg Low ,5 + declare_var_vector_reg Middle0 ,6 + declare_var_vector_reg Len ,7 + declare_var_vector_reg Tmp0 ,8 + declare_var_vector_reg Tmp1 ,9 + declare_var_vector_reg Zero ,10 + declare_var_vector_reg Poly ,11 + declare_var_vector_reg PartitialBlock ,13 + + declare_var_vector_reg Tmp2 ,31 + declare_var_vector_reg Tmp3 ,12 + + .set stack_size,48 + .macro push_stack + stp d8, d9,[sp,-stack_size]! 
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + .endm + .macro pop_stack + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d8, d9, [sp], stack_size + .endm + +START_FUNC(enc,KEY_LEN,_finalize_) +START_FUNC(dec,KEY_LEN,_finalize_) + ldr partial_block_len,[context,PARTIAL_BLOCK_LENGTH_OFF] + load_aes_keys key_data + push_stack + + ldr qOrigIV,[context,ORIG_IV_OFF] /* OrigIV */ + ldp qAadHash,qLen,[context],PARTIAL_BLOCK_ENC_KEY_OFF /* Len , context move to partial block*/ + /* Init Consts for ghash */ + movi vZero.4s,0 + mov temp0,0x87 + dup vPoly.2d,temp0 + /* complete part */ + cbnz partial_block_len,10f + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32] + aes_encrypt_round OrigIV,Key0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + shl vLen.2d,vLen.2d,3 /* Len */ + aes_encrypt_round OrigIV,Key2 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + rev64 vLen.16b,vLen.16b /* Len */ + aes_encrypt_round OrigIV,Key3 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + rbit vAadHash.16b,vLen.16b /* Len */ + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32] + aes_encrypt_round OrigIV,Key4 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aes_encrypt_round OrigIV,Key5 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key6 + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round OrigIV,Key7 + eor vHigh.16b,vHigh.16b,vTmp0.16b + eor vLow.16b ,vLow.16b ,vTmp1.16b + pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round OrigIV,Key8 + pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round OrigIV,Key9 + aes_encrypt_round OrigIV,Key10 + aes_encrypt_round OrigIV,Key11 + aes_encrypt_round OrigIV,Key12 + aese vOrigIV.16b,vKey13.16b + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b + eor vOrigIV.16b,vOrigIV.16b,vKey14.16b + rbit vAadHash.16b,vOrigIV.16b + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b + ghash_mult_final_round 
AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + rbit vAadHash.16b,vAadHash.16b /* Aad */ + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 */ + str dAadHash,[auth_tag] + pop_stack + ret + +10: /* cbnz partial_block_len,10f */ + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-3)*32] + aes_encrypt_round OrigIV,Key0 + read_small_data_start PartitialBlock,partial_block,partial_block_len,temp0,Tmp0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round OrigIV,Key2 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round OrigIV,Key3 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round OrigIV,Key4 + rbit vAadHash.16b,vPartitialBlock.16b + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32] + aes_encrypt_round OrigIV,Key5 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key6 + shl vLen.2d,vLen.2d,3 /* Len */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d + eor vHigh.16b,vHigh.16b,vTmp0.16b + aes_encrypt_round OrigIV,Key7 + eor vLow.16b,vLow.16b,vTmp1.16b + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + rev64 vLen.16b,vLen.16b /* Len */ + aes_encrypt_round OrigIV,Key8 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + pmull vTmp0.1q,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round OrigIV,Key9 + rbit vAadHash.16b,vLen.16b /* Len */ + ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32] + aes_encrypt_round OrigIV,Key10 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aes_encrypt_round OrigIV,Key11 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d + aes_encrypt_round OrigIV,Key12 + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d + aese vOrigIV.16b,vKey13.16b + eor 
vHigh.16b,vHigh.16b,vTmp0.16b + eor vLow.16b ,vLow.16b ,vTmp1.16b + pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d + pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b + eor vOrigIV.16b,vOrigIV.16b,vKey14.16b + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b + rbit vAadHash.16b,vOrigIV.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + rbit vAadHash.16b,vAadHash.16b /* Aad */ + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 */ + str dAadHash,[auth_tag] + pop_stack + ret + +END_FUNC(enc,KEY_LEN,_finalize_) +END_FUNC(dec,KEY_LEN,_finalize_) + + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S new file mode 100644 index 000000000..0dd94c6b7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S @@ -0,0 +1,161 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "gcm_common.S" +/* +void gist_aes_gcm_init_##mode( + const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t *iv, + uint8_t const *aad, + uint64_t aad_len + ); +*/ + key_data .req x0 + context .req x1 + iv .req x2 + aad .req x3 + aad_len .req x4 + temp0 .req x7 + wtemp0 .req w7 + temp1 .req x6 + left_len .req x5 + aad_left .req x2 + small_tbl_adr .req x6 + + hashkey_base .req x0 + hashkey_addr .req x2 + + declare_var_vector_reg AadHash,0 + declare_var_vector_reg Dat0,1 + declare_var_vector_reg HashKey0,2 + declare_var_vector_reg HashKey0Ext,3 + declare_var_vector_reg High,4 + declare_var_vector_reg Middle0,5 + declare_var_vector_reg Low,6 + declare_var_vector_reg LeftDat,7 + declare_var_vector_reg Zero,16 + declare_var_vector_reg Poly,17 + + declare_var_vector_reg Tmp0,18 + declare_var_vector_reg Tmp1,19 + declare_var_vector_reg Ctr,1 + + +START_FUNC(init,128,_) +START_FUNC(init,192,_) +START_FUNC(init,256,_) + stp aad_len,xzr,[context,AAD_LEN_OFF] //save in_length and aad_length + str xzr,[context,PARTIAL_BLOCK_LENGTH_OFF] //clear partial_block_length + add 
hashkey_base,key_data,HASHKEY_BASE_OFF + /* Init Consts for ghash */ + movi vZero.4s,0 + mov temp0,0x87 + dup vPoly.2d,temp0 + /* Set orig_IV */ + ldr wtemp0,[iv,8] + ldr temp1,[iv] + movk temp0,0x100,lsl 48 + stp temp1,temp0,[context,ORIG_IV_OFF] + and left_len,aad_len,15 + ldp qHashKey0,qHashKey0Ext,[key_data,(HASHKEY_TOTAL_NUM-1)*32] + /* Set current_counter, save as cpu order */ + ldr qCtr,[context,ORIG_IV_OFF] + rev32 vCtr.16b,vCtr.16b + str qCtr,[context,CTR_OFF] + cbz aad_len,init_zero_exit + lsr aad_len,aad_len,4 + /* Read small data */ + cbz left_len,2f + add aad_left,aad,aad_len,lsl 4 + read_small_data_start LeftDat,aad_left,left_len,small_tbl_adr,Tmp0 + cbz aad_len,24f // aad_len less than 16 +2: + cbnz left_len,1f + /*left_len == 0 && aad_len !=0 */ + + sub aad_len,aad_len,1 + /* leftDat = aad[-1] */ + ldr qLeftDat,[aad,aad_len,lsl 4] + cbz aad_len,24f /* aad_len == 16 */ +1: + /* aad_len > 16 */ + ldr qAadHash,[aad],16 + rbit vAadHash.16b,vAadHash.16b + sub aad_len,aad_len,1 +1: + /* loop ghash_block */ + cmp aad_len,HASHKEY_TOTAL_NUM - 1 + bls 1f /* break loop */ + sub aad_len,aad_len,HASHKEY_TOTAL_NUM + ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbz aad_len,23f /* left aad_len == 0 */ + mov temp0,HASHKEY_TOTAL_NUM - 1 + sub temp0,temp0,aad_len + add hashkey_addr,hashkey_base,temp0,lsl 5 + sub aad_len,aad_len,1 + + + ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */ +1: + cbz aad_len,1f + ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Tmp1,Dat0, 2 + + sub aad_len,aad_len,1 + b 1b +1: + ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1 + rbit vAadHash.16b, vLeftDat.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + str qAadHash,[context] + ret + +23: + 
ghash_block_reg AadHash,LeftDat, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0 + str qAadHash,[context] + ret +24: /* less or equal than 16 */ + rbit vLeftDat.16b, vLeftDat.16b + str qLeftDat,[context] + ret +init_zero_exit: + stp xzr,xzr,[context] + ret +END_FUNC(init,128,_) +END_FUNC(init,192,_) +END_FUNC(init,256,_) + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S new file mode 100644 index 000000000..c4e8ef59c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S @@ -0,0 +1,140 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a + .section .rodata +#define CONST_VAR_START(a) \ + .align 3;.global a;.type a, %object;a + +#define CONST_VAR_END(a) \ + .size a,. - a +CONST_VAR_START(shift_small_data_table): + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +CONST_VAR_START(read_small_data_table): + .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +CONST_VAR_END(shift_small_data_table) + .byte 0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff + .byte 
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff +CONST_VAR_START(write_small_data_table): + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff +CONST_VAR_END(read_small_data_table) + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0xff,0xff,0x04,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0x08,0x09,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0x0c,0xff + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff +CONST_VAR_START(read_end_small_data_table): + .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff +CONST_VAR_END(write_small_data_table) + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e + .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0e + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0e + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d + .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e + .byte 0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b + .byte 0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e + .byte 0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d +CONST_VAR_START(write_end_small_data_table): + .byte 0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e +CONST_VAR_END(read_end_small_data_table) + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0x0f,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff + .byte 
0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0x0f,0xff + .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff,0x0e,0x0f,0xff,0xff + .byte 0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f,0xff + .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff + .byte 0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0x0f,0xff + .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff +CONST_VAR_START(tbx_end_small_data_table): + .byte 0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff +CONST_VAR_END(write_end_small_data_table) + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f +CONST_VAR_START(tbx_start_small_data_table): + .byte 
0xff,0xff,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f +CONST_VAR_END(tbx_end_small_data_table) + .byte 0xff,0xff,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f +CONST_VAR_END(tbx_start_small_data_table) diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S new file mode 100644 index 000000000..9f1ff80fb --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S @@ -0,0 +1,30 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include "gcm_common_128.S" +#include "gcm_enc_dec.S" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S new file mode 100644 index 000000000..f3cc2b802 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S @@ -0,0 +1,30 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include "gcm_common_256.S" +#include "gcm_enc_dec.S" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S new file mode 100644 index 000000000..e635d7e70 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S @@ -0,0 +1,30 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include "gcm_common_128.S" +#include "gcm_precomp.S"
\ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S new file mode 100644 index 000000000..52b76a6a2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S @@ -0,0 +1,30 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include "gcm_common_256.S" +#include "gcm_precomp.S"
\ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S new file mode 100644 index 000000000..42c48d9a0 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S @@ -0,0 +1,32 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include "gcm_common_128.S" +#include "gcm_update.S" + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S new file mode 100644 index 000000000..1c2c33b48 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S @@ -0,0 +1,32 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include "gcm_common_256.S" +#include "gcm_update.S" + + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c new file mode 100644 index 000000000..1a2077356 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c @@ -0,0 +1,108 @@ +/********************************************************************** + Copyright(c) 2020-2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <aarch64_multibinary.h> + +#undef PROVIDER_BASIC +#define PROVIDER_BASIC(a) (void*)0 + +static unsigned long is_crypto_available(void) +{ + unsigned long auxval = getauxval(AT_HWCAP); + return (auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES); +} + +#define DEFINE_CBC_INTERFACE_DISPATCHER(func,mode,suffix) \ + DEFINE_INTERFACE_DISPATCHER(aes_cbc_##func##_##mode) \ + { \ + if (is_crypto_available()) \ + return PROVIDER_INFO(aes_cbc_##func##_##mode##_##suffix); \ + return PROVIDER_BASIC(aes_cbc_##func##_##mode); \ + } + +DEFINE_CBC_INTERFACE_DISPATCHER(enc, 128, aes); +DEFINE_CBC_INTERFACE_DISPATCHER(enc, 192, aes); +DEFINE_CBC_INTERFACE_DISPATCHER(enc, 256, aes); + +/* + * AES-CBC decryption can be parallelised according to algorithm. Decryption + * flow is to do decrypt and then EOR previous input data or IV(first). + * So, decryption can be parallelised and EOR all data as output data. + * + * The unroll factor depends on micro architecture. The factors of N1, A57 and A72 + * are based on optimization guide and test results. Other platforms are based on + * ThunderX2 test results. 
+ * + */ +DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_128) +{ + if (is_crypto_available()) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + return PROVIDER_INFO(aes_cbc_dec_128_aes_1); + case MICRO_ARCH_ID(ARM, CORTEX_A57): + return PROVIDER_INFO(aes_cbc_dec_128_aes_4); + case MICRO_ARCH_ID(ARM, CORTEX_A72): + return PROVIDER_INFO(aes_cbc_dec_128_aes_6); + } + return PROVIDER_INFO(aes_cbc_dec_128_aes_5); + } + return PROVIDER_BASIC(aes_cbc_dec_128); +} + +DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_192) +{ + if (is_crypto_available()) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + return PROVIDER_INFO(aes_cbc_dec_192_aes_1); + case MICRO_ARCH_ID(ARM, CORTEX_A57): + return PROVIDER_INFO(aes_cbc_dec_192_aes_5); + case MICRO_ARCH_ID(ARM, CORTEX_A72): + return PROVIDER_INFO(aes_cbc_dec_192_aes_4); + } + return PROVIDER_INFO(aes_cbc_dec_192_aes_5); + } + return PROVIDER_BASIC(aes_cbc_dec_192); +} + +DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_256) +{ + if (is_crypto_available()) { + switch (get_micro_arch_id()) { + case MICRO_ARCH_ID(ARM, NEOVERSE_N1): + return PROVIDER_INFO(aes_cbc_dec_256_aes_1); + case MICRO_ARCH_ID(ARM, CORTEX_A57): + return PROVIDER_INFO(aes_cbc_dec_256_aes_5); + case MICRO_ARCH_ID(ARM, CORTEX_A72): + return PROVIDER_INFO(aes_cbc_dec_256_aes_6); + } + return PROVIDER_INFO(aes_cbc_dec_256_aes_5); + } + return PROVIDER_BASIC(aes_cbc_dec_256); +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S new file mode 100644 index 000000000..6f793843a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S @@ -0,0 +1,54 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/

/* Exported symbol name for one variant: aes_cbc_<fn>_<mode>_<post>. */
#define FN_NAME(fn,mode,post) aes_cbc_##fn##_##mode##_##post

/* Assembler-local (.L prefixed) label name for the same triple. */
#define LABEL_NAME(fn,mode,post) .L##fn##_##mode##_##post

/* Open a function: make the symbol global, type it as a function, emit its label. */
#define START_FUNC(fn,mode,post) \
	.global FN_NAME(fn,mode,post); \
	.type FN_NAME(fn,mode,post), %function; \
	FN_NAME(fn,mode,post):

/* Close a function opened with START_FUNC by recording its size. */
#define END_FUNC(fn,mode,post) \
	.size FN_NAME(fn,mode,post), .-FN_NAME(fn,mode,post)

/*
 * declare_var_vector_reg name, reg
 *
 * Binds the aliases q<name>, v<name>, s<name> and d<name> to SIMD register
 * number <reg>. A symbol q<name> is also set to the register number; its
 * existence is what the .ifdef below tests, so a second declaration of the
 * same name first drops the old aliases and then rebinds them.
 */
.macro declare_var_vector_reg name:req,reg:req
.ifdef q\name
	.unreq	d\name
	.unreq	s\name
	.unreq	v\name
	.unreq	q\name
.endif
	.set q\name , \reg
	d\name	.req	d\reg
	s\name	.req	s\reg
	v\name	.req	v\reg
	q\name	.req	q\reg
.endm

/*
 * declare_var_generic_reg name, reg
 *
 * Binds <name> and x<name> to x<reg>, and w<name> to w<reg>. Unlike the
 * vector variant there is no .unreq step here.
 */
.macro declare_var_generic_reg name:req,reg:req
	w\name	.req	w\reg
	x\name	.req	x\reg
	\name	.req	x\reg
.endm
\ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S new file mode 100644 index 000000000..11bd90a71 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S @@ -0,0 +1,482 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
/*
 * AES-CBC decryption for AArch64 using the AES instructions.
 *
 * One function aes_cbc_dec_<mode>_aes_<blocks> is generated for every key
 * size (128/192/256) and every unroll factor 1..9 by the .irp at the end of
 * this file. Round keys are always named Key<first_key>..Key14 so that the
 * last round key is Key14 for every key size; first_key is 4, 2 or 0 for
 * 128, 192 or 256 bit keys (see define_aes_cbc_dec_func).
 */
	.arch armv8-a+crypto
	.text
#include "cbc_common.S"
	.altmacro

/*
 * One decryption round on register v<block> with round key Key<key>.
 * AESIMC is skipped for key index 13, the last AESD round. Key14 is applied
 * separately by EOR, so an index above 13 is a programming error.
 */
.macro _aes_decrypt_round block:req,key:req
	aesd	v\block\().16b,vKey\key\().16b
	.if \key < 13
	aesimc	v\block\().16b,v\block\().16b
	.endif
	.if \key > 13
	.error "_aes_decrypt_round: round key index must not exceed 13"
	.endif
.endm

/* One decryption round on input register In<reg>_<block>. */
.macro aes_decrypt_round block,reg,key
	_aes_decrypt_round In\reg\()_\block,\key
.endm

/*
 * Load every round key Key<first_key>..Key14 with post-increment loads.
 * Note that this advances the keys pointer.
 */
.macro load_keys first_key
	.if \first_key == 4
	ld1	{vKey4.4s -vKey6.4s},[keys],3*16
	.endif
	.ifc 2 , \first_key
	ldr	qKey2,[keys],1*16
	ld1	{vKey3.16b -vKey6.16b},[keys],4*16
	.endif
	.ifc 0 , \first_key
	ld1	{vKey0.16b -vKey2.16b},[keys],3*16
	ld1	{vKey3.16b -vKey6.16b},[keys],4*16
	.endif
	ld1	{vKey7.16b -vKey10.16b},[keys],4*16
	ld1	{vKey11.16b-vKey14.16b},[keys],4*16
.endm

/*
 * NOTE(review): aes_decrypt_blocks_round, aes_decrypt_rounds and
 * aes_cbc_decrypt_rounds are not referenced by any macro that the functions
 * generated in this file expand; they look like leftovers.
 */
.macro aes_decrypt_blocks_round blocks,key_idx,key_reg,next_keyreg,first_idx
	.if \key_idx == 12
	ldr	q\next_keyreg,[keys],(\first_idx-13)*16
	.else
	ldr	q\next_keyreg,[keys],16
	.endif
	n=0
	.rept \blocks
	_aes_decrypt_round %n,\key_reg
	n=n+1
	.endr
.endm

.macro aes_decrypt_rounds blocks,key_st,key_end,first_idx
	j=key_st
	.rept \key_end - \key_st + 1
	aes_decrypt_blocks_round \blocks,%j,%(j%2),%((j+1)%2),\first_idx
	j=j+1
	.endr
.endm

.macro aes_cbc_decrypt_rounds blocks,first_idx,reg,next_reg
	aes_decrypt_rounds \blocks,\first_idx,12,\first_idx
.endm

/* Bind the aliases q/v/s/d<prefix><idx> to SIMD register <reg>. */
.macro declare_prefix idx,reg,prefix
	declare_var_vector_reg \prefix\()\idx,\reg
.endm

/* Load one 16-byte block into In<reg>_<block>, advancing <addr>. */
.macro mldr reg,block,addr
	ldr	qIn\reg\()_\block,[\addr],16
.endm

/*
 * Load <blocks> (1..9) consecutive 16-byte blocks from [<addr>] into
 * In<reg>_0 .. In<reg>_<blocks-1>, advancing <addr> past them.
 */
.macro mldrin reg,blocks,addr
	.if \blocks == 1
	ldr	qIn\reg\()_0,[\addr],16
	.exitm
	.endif
	.if \blocks == 2
	ldp	qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
	.exitm
	.endif
	.if \blocks == 3
	ldr	qIn\reg\()_0,[\addr],16
	ldp	qIn\reg\()_1,qIn\reg\()_2,[\addr],2*16
	.exitm
	.endif
	.if \blocks == 4
	ld1	{vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
	.exitm
	.endif
	.if \blocks == 5
	ldr	qIn\reg\()_0,[\addr],16
	ld1	{vIn\reg\()_1.16b-vIn\reg\()_4.16b},[\addr],4*16
	.exitm
	.endif
	.if \blocks == 6
	ldp	qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
	ld1	{vIn\reg\()_2.16b-vIn\reg\()_5.16b},[\addr],4*16
	.exitm
	.endif
	.if \blocks == 7
	ld1	{vIn\reg\()_0.16b-vIn\reg\()_2.16b},[\addr],3*16
	ld1	{vIn\reg\()_3.16b-vIn\reg\()_6.16b},[\addr],4*16
	.exitm
	.endif

	.if \blocks == 8
	ld1	{vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
	ld1	{vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
	.exitm
	.endif
	.if \blocks == 9
	ld1	{vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
	ld1	{vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
	ldr	qIn\reg\()_8,[\addr],16
	.exitm
	.endif
.endm

/*
 * Store In<reg>_0 .. In<reg>_<blocks-1> (1..9 blocks) to [<addr>],
 * advancing <addr> past them. Mirror image of mldrin.
 */
.macro mstrout reg,blocks,addr
	.if \blocks == 1
	str	qIn\reg\()_0,[\addr],16
	.exitm
	.endif
	.if \blocks == 2
	stp	qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
	.exitm
	.endif
	.if \blocks == 3
	str	qIn\reg\()_0,[\addr],16
	stp	qIn\reg\()_1,qIn\reg\()_2,[\addr],2*16
	.exitm
	.endif
	.if \blocks == 4
	st1	{vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
	.exitm
	.endif
	.if \blocks == 5
	str	qIn\reg\()_0,[\addr],16
	st1	{vIn\reg\()_1.16b-vIn\reg\()_4.16b},[\addr],4*16
	.exitm
	.endif
	.if \blocks == 6
	stp	qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
	st1	{vIn\reg\()_2.16b-vIn\reg\()_5.16b},[\addr],4*16
	.exitm
	.endif
	.if \blocks == 7
	st1	{vIn\reg\()_0.16b-vIn\reg\()_2.16b},[\addr],3*16
	st1	{vIn\reg\()_3.16b-vIn\reg\()_6.16b},[\addr],4*16
	.exitm
	.endif

	.if \blocks == 8
	st1	{vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
	st1	{vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
	.exitm
	.endif
	.if \blocks == 9
	st1	{vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
	st1	{vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
	str	qIn\reg\()_8,[\addr],16
	.exitm
	.endif
.endm

/* Block<block> = Key14 EOR State<reg>_<block> (last round key folded into the chaining value). */
.macro eorkey14 block,reg
	eor	vBlock\block\().16b,vKey14.16b,vState\reg\()_\block\().16b
.endm

/* In<reg>_<block> ^= Block<block>: final key addition plus CBC chaining in one EOR. */
.macro eorblock block,reg
	eor	vIn\reg\()_\block\().16b,vBlock\block\().16b,vIn\reg\()_\block\().16b
.endm

/* Save cipher text block In<reg>_<block> as the chaining value State<reg>_0. */
.macro movstate0 block,reg
	mov	vState\reg\()_0.16b,vIn\reg\()_\block\().16b
.endm

/*
 * Decrypt the blocks held in In<reg>_0.. in place.
 *
 *   blocks      unroll factor the register layout was declared for
 *   reg         input buffer set to work on (0 or 1)
 *   first_key   index of the first round key (4, 2 or 0)
 *   cur_blocks  optional: number of blocks actually held; defaults to blocks
 *
 * State<reg>_<n> for n >= 1 aliases the register of In<reg>_<n-1> (see
 * aes_cbc_decrypt_blocks), i.e. the previous cipher text block. Each
 * Block<n> is therefore computed before the round that overwrites
 * In<reg>_<n-1>, and the last cipher text block is copied to State<reg>_0
 * before it is decrypted, ready for the next call.
 *
 * When 3*blocks+1 >= 32-15+first_key the round keys do not all fit in
 * registers and are fetched one round ahead with ldr_key.
 */
.macro cbc_decrypt_rounds blocks,reg,first_key,cur_blocks
	.ifb \cur_blocks
	_blocks=\blocks
	.else
	_blocks=\cur_blocks
	.endif
	key=\first_key + 1
	.if 3*\blocks+1 >= 32-15+\first_key
	ldr_key %key,\first_key
	.endif
	/* first round, interleaved with building the chaining values */
	n=0
	.rept _blocks - 1
	eorkey14 %((n+1)%_blocks),\reg
	aes_decrypt_round %n,\reg,\first_key
	n=n+1
	.endr
	eorkey14 0,\reg
	movstate0 %(_blocks-1),\reg
	aes_decrypt_round %n,\reg,\first_key

	/* middle rounds: keys first_key+1 .. 12 */
	k=0
	.rept 15-\first_key-3
	n=0
	.if 3*\blocks+1 >= 32-15+\first_key
	ldr_key %(key+k+1),\first_key
	.endif

	.rept _blocks
	aes_decrypt_round %n,\reg,%(key+k)
	n=n+1
	.endr
	k=k+1
	.endr
	/* last AESD round (key 13), then Key14 and chaining via Block<n> */
	n=0
	.if 3*\blocks+1 >= 32-15+\first_key
	/* reload the first round key for the next invocation */
	ldr_key \first_key,\first_key
	.endif
	.rept _blocks
	aes_decrypt_round %n,\reg,13
	eorblock %n,\reg
	n=n+1
	.endr
.endm

/* Assembly-time debugging aid: prints its arguments while assembling. */
.macro print_macro a,b,c,d,e
	.print "print_macro,\a \b \c \d \e"
.endm

/*
 * Handle 2^curblk leftover blocks if bit <curblk> of the remainder is set.
 * Nothing is emitted when the unroll factor cannot leave that many blocks.
 */
.macro remainder_process blocks,first_key,curblk
.if \blocks > (1<<\curblk)
	tbz	xlen_remainder,\curblk,1f
	mldrin 0,%(1<<\curblk),in
	cbc_decrypt_rounds \blocks,0,\first_key,%(1<<\curblk)
	mstrout 0,%(1<<\curblk),out
1:
.endif
.endm

/*
 * Function body: decrypt len_bytes (already a block count, see cbc_decrypt)
 * blocks in groups of <blocks>, then the leftover blocks.
 *
 * Register layout for an unroll factor m = blocks:
 *   v0                State0_0 / State1_0 (chaining value)
 *   v1     .. vm      In0_0 .. In0_<m-1>
 *   vm+1   .. v2m     In1_0 .. In1_<m-1>
 *   v2m+1  .. v3m     Block0 .. Block<m-1>
 *   round keys        v17+n for Key<n> when they all fit, otherwise
 *                     v29/v30 alternately plus v31 for Key14
 *
 * The main loop is double buffered: while buffer set 0 is decrypted the
 * next group is already loaded into set 1, and vice versa.
 * len_quotient counts groups still to be stored, len_quotient_in counts
 * groups still to be loaded.
 */
.macro aes_cbc_decrypt_blocks first_key,blocks
	division \blocks, len_bytes,len_remainder,tmp0,tmp1
	mov	xlen_quotient_in,xlen_quotient
	/*
	input regs(2*\block) + tmp regs(\blocks) + State reg(1)
		+ key regs(15-\first_key) < 32
	*/
	.if 3*\blocks+1 < 32-15+\first_key
	n=\first_key
	.rept 15-\first_key
	declare_prefix %n,%(n+17),Key
	n=n+1
	.endr
	load_keys \first_key
	.else
	n=\first_key
	.rept 14-\first_key
	declare_prefix %n,%((n%2)+29),Key
	n=n+1
	.endr
	declare_prefix 14,31,Key
	/* load first key */
	ldr_key \first_key,\first_key
	/* load last key */
	ldr_key 14,\first_key
	.endif
	m=\blocks
	l=\blocks-1
	declare_prefix 0,0,State0_
	declare_prefix 0,0,State1_
	n=0
	.rept \blocks
	declare_prefix %n,%(n+1),In0_
	declare_prefix %n,%(n+m+1),In1_
	declare_prefix %n,%(n+2*m+1),Block
	n=n+1
	.endr
	n=1
	.rept \blocks -1
	declare_prefix %n,%(n),State0_
	declare_prefix %n,%(n+m),State1_
	n=n+1
	.endr
	ldr	qState0_0,[IV]
	cbz	xlen_quotient,9f
	mldrin 0,\blocks,in
	sub	xlen_quotient_in,xlen_quotient_in,1
	b	5f

3:
	/* buffer set 1 has been decrypted: store it */
	sub	xlen_quotient,xlen_quotient,1
	mstrout 1,\blocks,out
	cbz	xlen_quotient,9f
5:
	/* prefetch the next group into set 1, then decrypt set 0 */
	cbz	xlen_quotient_in,1f
	mldrin 1,\blocks,in
	sub	xlen_quotient_in,xlen_quotient_in,1
1:
	cbc_decrypt_rounds \blocks,0,\first_key
	sub	xlen_quotient,xlen_quotient,1
	mstrout 0,\blocks,out
	cbz	xlen_quotient,9f

	/* prefetch the next group into set 0, then decrypt set 1 */
	cbz	xlen_quotient_in,1f
	mldrin 0,\blocks,in
	sub	xlen_quotient_in,xlen_quotient_in,1
1:
	cbc_decrypt_rounds \blocks,1,\first_key
	b	3b
9:
	/* leftover blocks, largest power of two first */
	remainder_process \blocks,\first_key,3
	remainder_process \blocks,\first_key,2
	remainder_process \blocks,\first_key,1
	remainder_process \blocks,\first_key,0
.endm


/*
 * Divide x<quotient> by the assembly-time constant <blocks> (1..9).
 * On exit x<quotient> holds the quotient and x<remainder> the remainder.
 * Powers of two use AND/LSR; the other divisors multiply by a fixed
 * reciprocal constant with UMULH. x<tmp0> and x<tmp1> are scratch.
 * For blocks == 1 the quotient register is left as is.
 */
.macro division blocks,quotient,remainder,tmp0,tmp1
	.if \blocks == 1
	mov	x\remainder, 0
	.exitm
	.endif
	.if \blocks == 2
	and	x\remainder, x\quotient, 1
	lsr	x\quotient, x\quotient, 1
	.exitm
	.endif
	.if \blocks == 3
	mov	x\tmp0, -6148914691236517206
	mov	x\remainder, x\quotient
	movk	x\tmp0, 0xaaab, lsl 0
	umulh	x\tmp0, x\quotient, x\tmp0
	and	x\tmp1, x\tmp0, -2
	lsr	x\quotient, x\tmp0, 1
	add	x\tmp1, x\tmp1, x\quotient
	sub	x\remainder, x\remainder, x\tmp1
	.exitm
	.endif
	.if \blocks == 4
	and	x\remainder, x\quotient, 3
	lsr	x\quotient, x\quotient, 2
	.exitm
	.endif
	.if \blocks == 5
	mov	x\tmp0, -3689348814741910324
	mov	x\remainder, x\quotient
	movk	x\tmp0, 0xcccd, lsl 0
	umulh	x\tmp0, x\quotient, x\tmp0
	and	x\tmp1, x\tmp0, -4
	lsr	x\quotient, x\tmp0, 2
	add	x\tmp1, x\tmp1, x\quotient
	sub	x\remainder, x\remainder, x\tmp1
	.exitm
	.endif
	.if \blocks == 6
	mov	x\tmp0, -6148914691236517206
	mov	x\tmp1, x\quotient
	movk	x\tmp0, 0xaaab, lsl 0
	umulh	x\tmp0, x\quotient, x\tmp0
	lsr	x\quotient, x\tmp0, 2
	add	x\remainder, x\quotient, x\quotient, lsl 1
	sub	x\remainder, x\tmp1, x\remainder, lsl 1
	.exitm
	.endif
	.if \blocks == 7
	mov	x\tmp0, 9363
	mov	x\tmp1, x\quotient
	movk	x\tmp0, 0x9249, lsl 16
	movk	x\tmp0, 0x4924, lsl 32
	movk	x\tmp0, 0x2492, lsl 48
	umulh	x\quotient, x\quotient, x\tmp0
	sub	x\tmp0, x\tmp1, x\quotient
	add	x\tmp0, x\quotient, x\tmp0, lsr 1
	lsr	x\quotient, x\tmp0, 2
	lsl	x\remainder, x\quotient, 3
	sub	x\remainder, x\remainder, x\quotient
	sub	x\remainder, x\tmp1, x\remainder
	.exitm
	.endif
	.if \blocks == 8
	and	x\remainder, x\quotient, 7
	lsr	x\quotient, x\quotient, 3
	.exitm
	.endif
	.if \blocks == 9
	mov	x\tmp0, 58255
	mov	x\remainder, x\quotient
	movk	x\tmp0, 0x8e38, lsl 16
	movk	x\tmp0, 0x38e3, lsl 32
	movk	x\tmp0, 0xe38e, lsl 48
	umulh	x\tmp0, x\quotient, x\tmp0
	and	x\tmp1, x\tmp0, -8
	lsr	x\quotient, x\tmp0, 3
	add	x\tmp1, x\tmp1, x\quotient
	sub	x\remainder, x\remainder, x\tmp1
	.exitm
	.endif
.endm

/* Load round key Key<num> from its slot relative to the (unmodified) keys pointer. */
.macro ldr_key num,first_key
	ldr	qKey\num,[keys,16*(\num - \first_key)]
.endm
#ifndef CBC_DECRYPT_BLOCKS_NUM
#define CBC_DECRYPT_BLOCKS_NUM 8
#endif

/*
 * Complete function body apart from the final ret: converts len_bytes to a
 * block count, returns straight away for zero blocks, and otherwise
 * decrypts with d8-d15 saved around the work.
 */
.macro cbc_decrypt first_key:req,blocks
	lsr	xlen_bytes,xlen_bytes,4
	cbz	xlen_bytes,10f
	push_stack
	aes_cbc_decrypt_blocks \first_key,\blocks
	pop_stack
10:
.endm

/* Save d8-d15 (the callee-saved halves of v8-v15) in a 64-byte frame. */
.set	stack_size,64
.macro push_stack
	stp	d8, d9,[sp,-stack_size]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
.endm

/* Restore d8-d15 and release the frame created by push_stack. */
.macro pop_stack
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	d8, d9, [sp], stack_size
.endm

/*
void aes_cbc_dec_128(
    void     *in,        //!< Input cipher text
    uint8_t  *IV,        //!< Must be 16 bytes aligned to a 16 byte boundary
    uint8_t  *keys,      //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
    void     *out,       //!< Output plain text
    uint64_t len_bytes   //!< Must be a multiple of 16 bytes
    );
*/
	/* arguments */
	declare_var_generic_reg in		,0
	declare_var_generic_reg IV		,1
	declare_var_generic_reg keys		,2
	declare_var_generic_reg out		,3
	declare_var_generic_reg len_bytes	,4
	/* len_quotient shares x4 with len_bytes, len_quotient_in shares x6 with tmp0 */
	declare_var_generic_reg len_quotient,4
	declare_var_generic_reg len_remainder,5
	declare_var_generic_reg tmp0		,6
	declare_var_generic_reg tmp1		,7
	declare_var_generic_reg len_quotient_in,6

/*
 * Emit aes_cbc_dec_<mode>_aes_<blocks>. The first round key index passed on
 * is (256-mode)/32, i.e. 4, 2 or 0 for mode 128, 192 or 256.
 */
.macro define_aes_cbc_dec_func mode:req,blocks:req
	.global aes_cbc_dec_\mode\()_aes_\blocks
aes_cbc_dec_\mode\()_aes_\blocks:
	cbc_decrypt %((256-mode)/32),\blocks
	ret
	.size aes_cbc_dec_\mode\()_aes_\blocks, . - aes_cbc_dec_\mode\()_aes_\blocks
.endm

.irp blocks,1,2,3,4,5,6,7,8,9
	define_aes_cbc_dec_func 128,\blocks
	define_aes_cbc_dec_func 192,\blocks
	define_aes_cbc_dec_func 256,\blocks
.endr
 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S new file mode 100644 index 000000000..8eb5e507d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S @@ -0,0 +1,157 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
/*
 * AES-CBC encryption for AArch64 using the AES instructions.
 *
 * Defines aes_cbc_enc_128_aes, aes_cbc_enc_192_aes and aes_cbc_enc_256_aes.
 * Round keys are always named Key<first>..Key14 so that the last round key
 * is Key14 for every key size; <first> is 4, 2 or 0 for 128, 192 or 256 bit
 * keys.
 */
	.arch armv8-a+crypto
	.text

#include "cbc_common.S"

	/* round keys live in v17-v31 */
	declare_var_vector_reg Key0	,17
	declare_var_vector_reg Key1	,18
	declare_var_vector_reg Key2	,19
	declare_var_vector_reg Key3	,20
	declare_var_vector_reg Key4	,21
	declare_var_vector_reg Key5	,22
	declare_var_vector_reg Key6	,23
	declare_var_vector_reg Key7	,24
	declare_var_vector_reg Key8	,25
	declare_var_vector_reg Key9	,26
	declare_var_vector_reg Key10	,27
	declare_var_vector_reg Key11	,28
	declare_var_vector_reg Key12	,29
	declare_var_vector_reg Key13	,30
	declare_var_vector_reg Key14	,31

/*
 * One encryption round on v<block> with round key Key<key>.
 * AESMC is skipped for key index 13, the last AESE round.
 */
.macro aes_encrypt_round block,key
	aese	v\block\().16b,vKey\key\().16b
	.if \key < 13
	aesmc	v\block\().16b,v\block\().16b
	.endif
.endm

/* Full round (AESE + AESMC) with an arbitrarily named key register v<key>. */
.macro aes_encrypt_round_name block,key
	aese	v\block\().16b,v\key\().16b
	aesmc	v\block\().16b,v\block\().16b
.endm



/*
 * Helpers to save/restore d8-d15 in a 64-byte frame.
 * NOTE(review): no function defined in this file invokes them.
 */
.set	stack_size,64
.macro push_stack
	stp	d8, d9,[sp,-stack_size]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
.endm

.macro pop_stack
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	d8, d9, [sp], stack_size
.endm
/*
 Arguments of the functions defined below, shown for the 128 bit variant.
 NOTE(review): the return type is not visible from this file; confirm it
 against the public header.

 aes_cbc_enc_128(
    void     *in,        //!< Input plain text
    uint8_t  *IV,        //!< Must be 16 bytes aligned to a 16 byte boundary
    uint8_t  *keys,      //!< Must be on a 16 byte boundary and length of key size * key rounds (presumably enc_keys of cbc_key_data; confirm)
    void     *out,       //!< Output cipher text
    uint64_t len_bytes   //!< Must be a multiple of 16 bytes
    );
*/
	declare_var_generic_reg in		,0
	declare_var_generic_reg IV		,1
	declare_var_generic_reg keys		,2
	declare_var_generic_reg out		,3
	declare_var_generic_reg len_bytes	,4

	declare_var_vector_reg State		,0
	declare_var_vector_reg FirstKey	,1
	declare_var_vector_reg Block		,2
	declare_var_vector_reg ConstKey	,3

/* Load round key Key<num> from [keys] and advance the pointer. */
.macro load_key num
	ldr	qKey\num,[keys],16
.endm
.altmacro
/*
 * Function body apart from the final ret; <first> is the index of the first
 * round key.
 *
 * Per block only one EOR sits between two AES chains:
 *   - ConstKey = Key<first> EOR Key14 is computed once.
 *   - The next plain text block is loaded into the Key<first> register and
 *     EORed with ConstKey, giving P_next EOR Key<first> EOR Key14.
 *   - Block still lacks the final Key14 addition when it is fed to the next
 *     first round, and AESE adds its key operand before substituting, so
 *     that round sees (Block EOR Key14) EOR P_next EOR Key<first>: the
 *     cipher text chained into the next plain text plus the first round key.
 *   - State = Block EOR Key14 is the cipher text that gets stored.
 */
.macro cbc_encrypt first:req
	lsr	xlen_bytes,xlen_bytes,4		/* bytes -> 16-byte blocks */
	cbz	xlen_bytes,3f
	ldr	qState,[IV]
	ldr	qKey\first,[keys],16
	.set	lastkey_off,13-\first
	ldr	qKey14,[keys,lastkey_off*16]
	ldr	qBlock,[in],16
	n=\first
	second=1+\first
	.rept 5-n
	n=n+1
	load_key %n
	.endr
	ld1	{vKey6.4s - vKey9.4s},[keys],4*16
	eor	vBlock.16b,vBlock.16b ,vState.16b
	eor	vConstKey.16b,vKey\first\().16b,vKey14.16b
	aes_encrypt_round Block,\first
	ld1	{vKey10.4s - vKey13.4s},[keys]
	b	1f
2:
	/* first round of the next block; Key<first> holds the folded value */
	aes_encrypt_round Block,\first
	str	qState,[out],16
1:
	sub	xlen_bytes,xlen_bytes,1
	aes_encrypt_round Block,%second
	cbz	xlen_bytes,1f
	/* fetch the next plain text block into the Key<first> register */
	ldr	qKey\first,[in],16
1:
	n=second
	.rept 12-n
	n=n+1
	aes_encrypt_round Block,%n
	.endr

	eor	vKey\first\().16b,vKey\first\().16b,vConstKey.16b
	aes_encrypt_round Block,13
	eor	vState.16b,vBlock.16b,vKey14.16b
	cbnz	xlen_bytes,2b
	str	qState,[out]
3:

.endm
START_FUNC(enc,128,aes)
	cbc_encrypt 4
	ret
END_FUNC(enc,128,aes)

START_FUNC(enc,192,aes)
	cbc_encrypt 2
	ret
END_FUNC(enc,192,aes)

START_FUNC(enc,256,aes)
	cbc_encrypt 0
	ret
END_FUNC(enc,256,aes)
\ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S new file mode 100644 index 000000000..fba533754 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S @@ -0,0 +1,38 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "aarch64_multibinary.h" + +/* One mbin_interface entry per AES-CBC entry point; the names match the dispatchers defined in cbc_aarch64_dispatcher.c. NOTE(review): mbin_interface itself comes from aarch64_multibinary.h and is not visible here; presumably it emits the public symbol that resolves through <name>_dispatcher - confirm. */ +mbin_interface aes_cbc_dec_128 +mbin_interface aes_cbc_dec_192 +mbin_interface aes_cbc_dec_256 + +mbin_interface aes_cbc_enc_128 +mbin_interface aes_cbc_enc_192 +mbin_interface aes_cbc_enc_256 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c new file mode 100644 index 000000000..f8188e3ae --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c @@ -0,0 +1,255 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#include <aarch64_multibinary.h> + +#undef PROVIDER_BASIC +#define PROVIDER_BASIC(a) (void*)0 + +static unsigned long is_crypto_available(void) +{ + unsigned long auxval = getauxval(AT_HWCAP); + return (auxval & (HWCAP_ASIMD | HWCAP_AES | HWCAP_PMULL)) == + (HWCAP_ASIMD | HWCAP_AES | HWCAP_PMULL); +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_precomp_128) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_precomp_128_aes); + + return PROVIDER_BASIC(aes_gcm_precomp_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_precomp_256) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_precomp_256_aes); + + return PROVIDER_BASIC(aes_gcm_precomp_256); + +} + 
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_update) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_update_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128_update); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_finalize) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_finalize_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128_finalize); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_update) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_update_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128_update); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_finalize) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_finalize_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128_finalize); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_update) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_update_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256_update); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_finalize) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_finalize_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256_finalize); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_update) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_update_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256_update); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_finalize) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_finalize_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256_finalize); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_init_256) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_init_256_aes); + + return PROVIDER_BASIC(aes_gcm_init_256); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_init_128) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_init_128_aes); + + return PROVIDER_BASIC(aes_gcm_init_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_nt) +{ + if 
(is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_nt_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_update_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_128_update_nt_aes); + + return PROVIDER_BASIC(aes_gcm_enc_128_update_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_nt_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_update_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_128_update_nt_aes); + + return PROVIDER_BASIC(aes_gcm_dec_128_update_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_nt_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_update_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_enc_256_update_nt_aes); + + return PROVIDER_BASIC(aes_gcm_enc_256_update_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_nt_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256_nt); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_update_nt) +{ + if (is_crypto_available()) + return PROVIDER_INFO(aes_gcm_dec_256_update_nt_aes); + + return PROVIDER_BASIC(aes_gcm_dec_256_update_nt); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S new file mode 100644 index 000000000..042f6cf19 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S @@ -0,0 +1,430 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/
/*
 * Shared GNU-as macros for the AArch64 AES-GCM routines.  This file is run
 * through the C preprocessor and is included by gcm_common_128.S and
 * gcm_common_256.S, which define KEY_LEN and the key-size specific macros
 * (aes_encrypt_block, load_aes_keys, aes_gcm_init, aes_gcm_middle).
 *
 * Convention used by the macros below: vector arguments are register-name
 * suffixes that get a q/v/d/s prefix pasted in front of them (a bare number
 * or a name created with declare_var_vector_reg).
 */
	.arch armv8-a+crypto
	.text
#define HASHKEY_TOTAL_NUM	(24)
#define HASHKEY_BASE_OFF	(15*16)
/* Byte offsets of hash-key pair n: each pair is 32 bytes [key, key_ext]. */
#define HASHKEY_OFF(n)		(HASHKEY_BASE_OFF+(n)*32)
#define HASHKEY_EXT_OFF(n)	(HASHKEY_BASE_OFF+(n)*32+16)
#ifndef KEY_LEN
#define KEY_LEN 128
#endif
#ifndef BLOCKS
#define BLOCKS 24
#endif
#define FN_NAME(fn,mode,post) aes_gcm_##fn##_##mode####post##aes
#define START_FUNC(fn,mode,post) .global FN_NAME(fn,mode,post); \
	.type	FN_NAME(fn,mode,post), %function; \
	FN_NAME(fn,mode,post):
#define END_FUNC(fn,mode,post)	.size FN_NAME(fn,mode,post), .-FN_NAME(fn,mode,post)

/* Byte offsets used with the context pointer. */
#define AAD_LEN_OFF			16
#define IN_LENGTH_OFF			24
#define PARTIAL_BLOCK_ENC_KEY_OFF	32
#define PARTIAL_BLOCK_LENGTH_OFF	80
#define CTR_OFF				64
#define ORIG_IV_OFF			48

/*
 * One accumulation step of a multi-block GHASH:
 *	[high,middle,low] ^= aadhash * [hashkey0,hashkey0_ext]
 *	next_dat = *dat_adr; dat_adr += 16
 *	if left_count > 1:
 *		[hashkey0,hashkey0_ext] = *hashkey_adr; hashkey_adr += 32
 *	aadhash = rbit(next_dat)
 * tmp0/tmp1 are scratch vectors.
 */
.macro ghash_mult_round aadhash:req,dat_adr:req,hashkey_adr:req, \
	hashkey0:req,hashkey0_ext:req,high:req,low:req,middle:req, \
	tmp0:req,tmp1:req,next_dat:req,left_count:req

	ldr	q\next_dat,[\dat_adr],16
	pmull	v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
	pmull2	v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
	.if \left_count > 1
		ldr	q\hashkey0_ext,[\hashkey_adr,16]
	.endif
	eor	v\middle\().16b,v\middle\().16b,v\tmp0\().16b
	pmull2	v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
	eor	v\middle\().16b,v\middle\().16b,v\tmp1\().16b
	pmull	v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
	.if \left_count > 1
		ldr	q\hashkey0,[\hashkey_adr],32
	.endif
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b
	rbit	v\aadhash\().16b, v\next_dat\().16b
.endm

/*
 * First step of a multi-block GHASH.  Same data flow as ghash_mult_round but
 * it loads the first hash-key pair itself and initialises (rather than
 * accumulates into) high/middle/low.
 */
.macro ghash_mult_init_round aadhash:req,dat_adr:req,hashkey_adr:req, \
	hashkey0:req,hashkey0_ext:req, \
	high:req,low:req,middle:req,tmp0:req,next_dat:req,left_count:req
	ldp	q\hashkey0,q\hashkey0_ext,[\hashkey_adr],32
	ldr	q\next_dat,[\dat_adr],16
	pmull	v\middle\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
	pmull2	v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
	.if \left_count > 1
		ldr	q\hashkey0_ext,[\hashkey_adr,16]
	.endif
	pmull2	v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d
	eor	v\middle\().16b,v\middle\().16b,v\tmp0\().16b

	pmull	v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d
	.if \left_count > 1
		ldr	q\hashkey0,[\hashkey_adr],32
	.endif
	rbit	v\aadhash\().16b, v\next_dat\().16b
.endm

/*
 * aadhash ^= reduction(low,middle,high)
 * Folds middle into high/low, then folds high into low twice using the
 * polynomial constant in poly.  zero must hold an all-zero vector.
 * Clobbers high, low, middle and tmp0.
 */
.macro ghash_mult_final_round aadhash:req, \
	high:req,low:req,middle:req,tmp0:req, \
	zero:req,poly:req

	ext	v\tmp0\().16b,v\middle\().16b,v\zero\().16b,8	/*high*/
	ext	v\middle\().16b,v\zero\().16b,v\middle\().16b,8	/*low */
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	eor	v\low\().16b,v\low\().16b,v\middle\().16b

	pmull2	v\middle\().1q,v\high\().2d,v\poly\().2d

	ext	v\tmp0\().16b,v\middle\().16b,v\zero\().16b,8	/*high*/
	ext	v\middle\().16b,v\zero\().16b,v\middle\().16b,8	/*low*/
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	eor	v\low\().16b,v\low\().16b,v\middle\().16b
	pmull	v\middle\().1q,v\high\().1d,v\poly\().1d
	eor	v\tmp0\().16b, v\low\().16b, v\middle\().16b
	eor	v\aadhash\().16b, v\aadhash\().16b, v\tmp0\().16b
.endm

/*
 * hashkey_addr = hashkey_base + (HASHKEY_TOTAL_NUM - count) * 32
 * i.e. skip the leading pairs so that exactly count pairs remain.
 */
.macro ghash_reset_hashkey_addr hashkey_addr:req,hashkey_base:req,count:req
	add	\hashkey_addr,\hashkey_base,(HASHKEY_TOTAL_NUM-\count)<<5
.endm


/*
 * GHASH over count consecutive 16-byte blocks: one init round, count-1
 * accumulation rounds, then the final reduction.  count is an assemble-time
 * constant; the assembler symbol left_count is (re)defined as a side effect.
 */
.macro ghash_block_n count:req,aadhash:req, dat:req,dat_addr:req, hashkey_addr:req, hashkey_base:req, \
	hashkey:req,hashkey_ext:req,high:req,low:req,middle:req, zero:req,poly:req, \
	tmp0:req,tmp1:req

	ghash_reset_hashkey_addr \hashkey_addr,\hashkey_base,\count
	ghash_mult_init_round \aadhash,\dat_addr,\hashkey_addr,\hashkey,\hashkey_ext, \
		\high,\low,\middle,\tmp0,\dat,\count
	.set left_count,\count - 1
	.rept left_count
		ghash_mult_round \aadhash,\dat_addr,\hashkey_addr,\hashkey,\hashkey_ext, \
			\high,\low,\middle,\tmp0,\tmp1,\dat, left_count
		.set left_count,left_count - 1

	.endr
	ghash_mult_final_round \aadhash,\high,\low,\middle,\tmp0,\zero,\poly
.endm

/*
	aadhash=aadhash*[hashkey,hashkey_ext] + rbit(dat)
	Single-block variant: hash key and data are already in registers.
*/
.macro ghash_block_reg aadhash:req, dat:req, \
	hashkey:req,hashkey_ext:req,high:req,low:req,middle:req, zero:req,poly:req, \
	tmp0:req
	pmull	v\middle\().1q,v\aadhash\().1d,v\hashkey_ext\().1d
	pmull2	v\tmp0\().1q,v\aadhash\().2d,v\hashkey_ext\().2d
	pmull2	v\high\().1q,v\aadhash\().2d,v\hashkey\().2d
	eor	v\middle\().16b,v\middle\().16b,v\tmp0\().16b
	pmull	v\low\().1q,v\aadhash\().1d,v\hashkey\().1d
	rbit	v\aadhash\().16b, v\dat\().16b
	ghash_mult_final_round \aadhash,\high,\low,\middle,\tmp0,\zero,\poly
.endm

/*
 * [high,middle,low] ^= aadhash * [hashkey0,hashkey0_ext]
 * Pure register version of ghash_mult_round: no loads, aadhash unchanged.
 */
.macro ghash_mult_round_noload aadhash:req, \
	hashkey0:req,hashkey0_ext:req,high:req,low:req,middle:req, \
	tmp0:req,tmp1:req

	pmull	v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
	pmull2	v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
	eor	v\middle\().16b,v\middle\().16b,v\tmp0\().16b
	pmull2	v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
	eor	v\middle\().16b,v\middle\().16b,v\tmp1\().16b
	pmull	v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b

.endm

/*
 * aadhash = reduction([low,high],poly) + aadhash
 * Two-input reduction (the middle term must already be folded into
 * high/low).  aadhash is first added into low and then reused as the zero
 * vector.  Clobbers high, low, tmp0 and tmp1.
 */
.macro poly_mult_final_x2 aadhash:req, \
	high:req,low:req,tmp0:req,tmp1:req, \
	poly:req
	pmull2	v\tmp1\().1q,v\high\().2d,v\poly\().2d
	eor	v\low\().16b, v\aadhash\().16b, v\low\().16b
	eor	v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
	ext	v\tmp0\().16b,v\tmp1\().16b,v\aadhash\().16b,8	//high
	ext	v\tmp1\().16b,v\aadhash\().16b,v\tmp1\().16b,8	//low
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b
	pmull	v\tmp1\().1q,v\high\().1d,v\poly\().1d
	eor	v\aadhash\().16b, v\low\().16b, v\tmp1\().16b
.endm

/* One AES round on block with round key key: AESE followed by AESMC. */
.macro aes_encrypt_round	block,key
	aese	v\block\().16b,v\key\().16b
	aesmc	v\block\().16b,v\block\().16b
.endm

/* Create q/v/s/d<name> aliases for vector register number reg. */
.macro declare_var_vector_reg name:req,reg:req
	q\name	.req	q\reg
	v\name	.req	v\reg
	s\name	.req	s\reg
	d\name	.req	d\reg
.endm

/* Create <name>, x<name> and w<name> aliases for general register reg. */
.macro declare_var_generic_reg name:req,reg:req
	\name	.req	x\reg
	x\name	.req	x\reg
	w\name	.req	w\reg
.endm

/*
 * Read size (< 16) bytes from src into dest.
 * Bits 3..0 of size select 8/4/2/1-byte loads into lanes d[0], s[2], h[6]
 * and b[14]; the 16-byte entry tbl_adr[size] is then used as a TBL
 * permutation to place the bytes.  src is advanced by the bytes read.
 */
.macro read_small_data dest:req,src:req,size:req,tbl_adr:req,tbl:req
	ldr	q\tbl,[\tbl_adr,\size,lsl 4]
	tbz	\size,3,1f
	ld1	{v\dest\().d}[0],[\src],8
1:
	tbz	\size,2,1f
	ld1	{v\dest\().s}[2],[\src],4
1:
	tbz	\size,1,1f
	ld1	{v\dest\().h}[6],[\src],2
1:
	tbz	\size,0,1f
	ld1	{v\dest\().b}[14],[\src],1
1:
	tbl	v\dest\().16b,{v\dest\().16b},v\tbl\().16b
.endm

/* read_small_data using read_small_data_table (address taken via the GOT). */
.macro read_small_data_start dest:req,src:req,size:req,tbl_adr:req,tbl:req
	adrp	\tbl_adr,:got:read_small_data_table
	ldr	\tbl_adr,[\tbl_adr,#:got_lo12:read_small_data_table]
	read_small_data \dest,\src,\size,\tbl_adr,\tbl
.endm

/* read_small_data using read_end_small_data_table. */
.macro read_small_data_end dest:req,src:req,size:req,tbl_adr:req,tbl:req
	adrp	\tbl_adr,:got:read_end_small_data_table
	ldr	\tbl_adr,[\tbl_adr,#:got_lo12:read_end_small_data_table]
	read_small_data \dest,\src,\size,\tbl_adr,\tbl
.endm

/*
 * Write size (< 16) bytes of src to dest.
 * src is permuted through the table entry tbl_adr[size] into tmp1, whose
 * lanes d[0], s[2], h[6], b[14] are stored according to bits 3..0 of size.
 * dest is advanced by the bytes written; src is left unchanged.
 */
.macro write_small_data src:req,dest:req,size:req,tbl_adr:req,tmp1:req
	ldr	q\tmp1,[\tbl_adr,\size,lsl 4]
	tbl	v\tmp1\().16b,{v\src\().16b},v\tmp1\().16b
	tbz	\size,3,1f
	st1	{v\tmp1\().d}[0],[\dest],8
1:
	tbz	\size,2,1f
	st1	{v\tmp1\().s}[2],[\dest],4
1:
	tbz	\size,1,1f
	st1	{v\tmp1\().h}[6],[\dest],2
1:
	tbz	\size,0,1f
	st1	{v\tmp1\().b}[14],[\dest],1
1:
.endm

/* write_small_data using write_small_data_table. */
.macro write_small_data_start src:req,dest:req,size:req,tbl_adr:req,tmp1:req
	adrp	\tbl_adr,:got:write_small_data_table
	ldr	\tbl_adr,[\tbl_adr,#:got_lo12:write_small_data_table]
	write_small_data \src,\dest,\size,\tbl_adr,\tmp1
.endm

/* write_small_data using write_end_small_data_table. */
.macro write_small_data_end src:req,dest:req,size:req,tbl_adr:req,tmp1:req
	adrp	\tbl_adr,:got:write_end_small_data_table
	ldr	\tbl_adr,[\tbl_adr,#:got_lo12:write_end_small_data_table]
	write_small_data \src,\dest,\size,\tbl_adr,\tmp1
.endm

/*
 * dest = TBX(dest, src, tbx_end_small_data_table[size])
 * Bytes of dest whose table index is out of range are kept (TBX semantics).
 */
.macro tbx_small_data_end src:req,dest:req,size:req,tbl_adr:req,tmp1:req
	adrp	\tbl_adr,:got:tbx_end_small_data_table
	ldr	\tbl_adr,[\tbl_adr,#:got_lo12:tbx_end_small_data_table]
	ldr	q\tmp1,[\tbl_adr,\size,lsl 4]
	tbx	v\dest\().16b,{v\src\().16b},v\tmp1\().16b
.endm

/* Same as tbx_small_data_end but indexes tbx_start_small_data_table. */
.macro tbx_small_data_start src:req,dest:req,size:req,tbl_adr:req,tmp1:req
	adrp	\tbl_adr,:got:tbx_start_small_data_table
	ldr	\tbl_adr,[\tbl_adr,#:got_lo12:tbx_start_small_data_table]
	ldr	q\tmp1,[\tbl_adr,\size,lsl 4]
	tbx	v\dest\().16b,{v\src\().16b},v\tmp1\().16b
.endm


/*
 * Overwrite part of dest with bytes of zero, selected by the 16-byte index
 * vector read from shift_small_data_table + 16 - size (unaligned load).
 */
.macro clear_small_data dest:req,zero:req,size:req,tbl_adr:req,tmp1:req
	adrp	\tbl_adr,:got:shift_small_data_table
	ldr	\tbl_adr,[\tbl_adr,#:got_lo12:shift_small_data_table]
	add	\tbl_adr,\tbl_adr,16
	sub	\tbl_adr,\tbl_adr,\size
	ldr	q\tmp1,[\tbl_adr]
	tbx	v\dest\().16b,{v\zero\().16b},v\tmp1\().16b
.endm


/*
 * Process count blocks with interleaved AES-CTR and GHASH: aes_gcm_init for
 * the first block, aes_gcm_middle for the remaining count-1, then the final
 * polynomial reduction into aadhash.  aes_gcm_init / aes_gcm_middle come
 * from the key-size specific file that includes this one.
 */
.macro aes_gcm_n_round is_enc:req,count:req,aadhash:req, dat_addr:req, \
	hashkey_addr:req, hashkey_base:req, \
	hashkey:req,hashkey_ext:req,high:req,low:req, poly:req, \
	ctr:req,enc_ctr:req,one:req,out_adr:req, \
	tmp0:req,tmp1:req

	ghash_reset_hashkey_addr \hashkey_addr,\hashkey_base,\count

	aes_gcm_init \is_enc,\aadhash,\dat_addr,\hashkey_addr, \
		\hashkey,\hashkey_ext, \high,\low, \
		\ctr,\enc_ctr,\one,\out_adr, \
		\tmp0,\tmp1,\count

	.set left_count,\count - 1
	.rept left_count
		aes_gcm_middle \is_enc,\aadhash,\dat_addr,\hashkey_addr, \
			\hashkey,\hashkey_ext, \high,\low, \
			\ctr,\enc_ctr,\one,\out_adr, \
			\tmp0,\tmp1, left_count
		.set left_count,left_count - 1
	.endr

	poly_mult_final_x2 \aadhash,\high,\low,\tmp0,\tmp1,\poly

.endm


/*
	aadhash=aadhash*[last hash-key pair of hashkey_base] + rbit(dat)

	The pair used is entry HASHKEY_TOTAL_NUM-1 of the table at hashkey_base.
	tmp2 serves first as the zero vector and then as the polynomial
	constant (0x87 duplicated into both lanes via the general register
	temp0).  Clobbers hashkey, high, low, tmp0, tmp1, tmp2 and temp0.
*/
.macro ghash_block_reg_x2 aadhash:req, dat:req, hashkey_base:req, \
	hashkey:req,high:req,low:req,tmp0:req, tmp1:req, \
	tmp2:req,temp0:req
	ldr	q\hashkey,[\hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16]
	eor	v\tmp2\().16b,v\tmp2\().16b,v\tmp2\().16b	/* zero */
	pmull	v\tmp1\().1q,v\aadhash\().1d,v\hashkey\().1d
	pmull2	v\tmp0\().1q,v\aadhash\().2d,v\hashkey\().2d
	ldr	q\hashkey,[\hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
	eor	v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
	/* low half must be extracted before tmp0 is overwritten by the high half */
	ext	v\tmp1\().16b,v\tmp2\().16b,v\tmp0\().16b,8	/*low*/
	ext	v\tmp0\().16b,v\tmp0\().16b,v\tmp2\().16b,8	/*high*/
	pmull2	v\high\().1q,v\aadhash\().2d,v\hashkey\().2d
	mov	\temp0,0x87
	pmull	v\low\().1q,v\aadhash\().1d,v\hashkey\().1d
	dup	v\tmp2\().2d,\temp0
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b
	rbit	v\aadhash\().16b, v\dat\().16b
	poly_mult_final_x2 \aadhash,\high,\low,\tmp0,\tmp1,\tmp2
.endm

/*
 * If bit len_bit of small_read_len is set, process 1<<len_bit bytes:
 *	out = in ^ partial_block
 *	partial_block = (encrypt) out, (decrypt) in
 * in_adr, out_adr and partial_block are post-incremented.
 * r is the register-width prefix (x or w) and p the optional ldr/str size
 * suffix (h or b); temp0/temp1 must be names for which the r-prefixed alias
 * exists (see declare_var_generic_reg).
 */
.macro __generic_load_small_data is_enc:req,len_bit:req,small_read_len:req, \
	in_adr:req,out_adr:req,partial_block:req,temp0:req,temp1:req,r:req,p
	tbz	\small_read_len,\len_bit,1f
	ldr\p	\r\()\temp0,[\in_adr],1<<\len_bit	/*in */
	ldr\p	\r\()\temp1,[\partial_block]	/* partial*/
	eor	\r\()\temp1,\r\()\temp0,\r\()\temp1
	.ifc \is_enc ,decrypt
		str\p	\r\()\temp0,[\partial_block],1<<\len_bit
	.endif
	.ifc \is_enc, encrypt
		str\p	\r\()\temp1,[\partial_block],1<<\len_bit
	.endif
	str\p	\r\()\temp1,[\out_adr],1<<\len_bit
1:
.endm

/* Apply __generic_load_small_data for 8, 4, 2 and 1 byte pieces in turn. */
.macro generic_load_partial_block is_enc:req,small_read_len:req,in_adr:req,out_adr:req, \
	partial_block:req,temp0:req,temp1:req
	__generic_load_small_data \is_enc,3,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,x	/* small_read_len >=8 */
	__generic_load_small_data \is_enc,2,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w	/* small_read_len >=4 */
	__generic_load_small_data \is_enc,1,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w,h	/* small_read_len >=2 */
	__generic_load_small_data \is_enc,0,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w,b	/* small_read_len >=1 */
.endm

/*
 * Without-Neon version: consume input into an already started partial block.
 *	small_read_len = min(in_len, 16 - partial_block_len)
 *	partial_block_len = (partial_block_len + small_read_len) & 0xf
 *	                    (also stored to the context)
 *	in_len -= small_read_len
 * left_partial_block_len is reused as a scratch register for the copy.
 */
.macro generic_partial_block_start is_enc:req,in_len:req,in_adr:req,out_adr:req,context:req, \
	partial_block:req,partial_block_len:req,small_read_len:req,left_partial_block_len:req, \
	temp0:req
	mov	\left_partial_block_len,16
	add	\partial_block,\context,PARTIAL_BLOCK_ENC_KEY_OFF
	sub	\left_partial_block_len,\left_partial_block_len,\partial_block_len
	add	\partial_block,\partial_block,\partial_block_len
	cmp	\in_len,\left_partial_block_len
	csel	\small_read_len,\in_len,\left_partial_block_len, ls
	add	\partial_block_len,\partial_block_len,\small_read_len
	sub	\in_len,\in_len,\small_read_len
	and	\partial_block_len,\partial_block_len,0xf
	str	\partial_block_len,[\context,PARTIAL_BLOCK_LENGTH_OFF]
	generic_load_partial_block \is_enc,\small_read_len,\in_adr,\out_adr,\partial_block, \
		\left_partial_block_len,\temp0	/* small_read_len >=8 */
.endm

/*
 * Trailing partial block: record in_len as the new partial block length and
 * process in_len bytes against the start of the partial block buffer.
 */
.macro generic_paritial_block_end is_enc:req,in_len:req,in_adr:req,out_adr:req,context:req, \
	partial_block:req,temp0:req,temp1:req
	str	\in_len,[\context,PARTIAL_BLOCK_LENGTH_OFF]
	add	\partial_block,\context,PARTIAL_BLOCK_ENC_KEY_OFF
	generic_load_partial_block \is_enc,\in_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1	/* small_read_len >=8 */
.endm

/*
 * Byte-wise partial block update; does nothing when in_len is zero.
 * Caller guarantees partial_block_len + in_len < 16.
 *	out[i] = in[i] ^ partial_block[len + i]
 *	partial_block[len + i] = (encrypt) out[i], (decrypt) in[i]
 * The stored partial block length is increased by in_len.
 * Clobbers context (left pointing into the partial block buffer), in_len,
 * temp0 and temp1.  Ctr is accepted for interface compatibility and unused.
 */
.macro paritial_block_small_length is_enc:req,context:req,in_len:req,in_adr:req,out_adr:req,temp0:req,temp1:req,Ctr:req

	cbz	\in_len,1f
	ldr	\temp0,[\context,PARTIAL_BLOCK_LENGTH_OFF]
	add	\temp1,\temp0,\in_len
	str	\temp1,[\context,PARTIAL_BLOCK_LENGTH_OFF]
	/* context = &context->partial_block[old length] */
	add	\context,\context,\temp0
	add	\context,\context,PARTIAL_BLOCK_ENC_KEY_OFF
2:/* loop start */
	sub	\in_len,\in_len,1
	ldrb	w\temp0,[\in_adr],1
	ldrb	w\temp1,[\context]
	eor	w\temp1,w\temp1,w\temp0
	strb	w\temp1,[\out_adr],1
.ifc \is_enc , encrypt
	strb	w\temp1,[\context],1
.endif
.ifc \is_enc,decrypt
	strb	w\temp0,[\context],1
.endif
	cbnz	\in_len,2b
1:/* loop end */
.endm

/*
 * 0 < in_len < 16, partial_block_len = 0
 * Increments and stores the counter, reads the in_len input bytes into
 * PartialBlock and runs aes_encrypt_block on ctr.
 * temp0 is used as the table address and Tmp2 as the table vector for the
 * small read.
 * NOTE(review): this macro looks unfinished - nothing is XORed or written to
 * out_adr, and ctr is encrypted in place.  Confirm intent before using it.
 */
.macro paritial_block_end is_enc:req,context:req,in_len:req,in_adr:req,out_adr:req, \
	temp0:req,partial_block_len:req, \
	PartialBlock:req,ctr:req,one:req,Tmp2:req,Tmp3:req,Tmp4:req
	add	v\ctr\().4s,v\ctr\().4s,v\one\().4s	//increase ctr
	str	q\ctr,[\context,CTR_OFF]
	read_small_data_start \PartialBlock,\in_adr,\in_len,\temp0,\Tmp2
	aes_encrypt_block \ctr

.endm

	/* Round keys 0..10 live in v16..v26. */
	declare_var_vector_reg	Key0 ,16
	declare_var_vector_reg	Key1 ,17
	declare_var_vector_reg	Key2 ,18
	declare_var_vector_reg	Key3 ,19
	declare_var_vector_reg	Key4 ,20
	declare_var_vector_reg	Key5 ,21
	declare_var_vector_reg	Key6 ,22
	declare_var_vector_reg	Key7 ,23
	declare_var_vector_reg	Key8 ,24
	declare_var_vector_reg	Key9 ,25
	declare_var_vector_reg	Key10,26
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S new file mode 100644 index 000000000..02add91a2 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S @@ -0,0 +1,165 @@
/**********************************************************************
  Copyright(c) 2021 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
/* AES-128 specialisation of the shared GCM macros (11 round keys). */
#define KEY_LEN 128
#include "gcm_common.S"

#define KEY_REGS 0,1,2,3,4,5,6,7,8
/*
 * Encrypt block in place with the round keys held in vKey0..vKey10:
 * nine AESE+AESMC rounds, a final AESE with Key9, then XOR with Key10.
 */
.macro aes_encrypt_block block:req
	aes_encrypt_round \block,Key0
	aes_encrypt_round \block,Key1
	aes_encrypt_round \block,Key2
	aes_encrypt_round \block,Key3
	aes_encrypt_round \block,Key4
	aes_encrypt_round \block,Key5
	aes_encrypt_round \block,Key6
	aes_encrypt_round \block,Key7
	aes_encrypt_round \block,Key8
	aese	v\block\().16b,vKey9.16b
	eor	v\block\().16b,v\block\().16b,vKey10.16b
.endm

/*
	Load the 11 AES round keys into vKey0..vKey10.
	key_addr is post-incremented by 15*16 bytes in total (64+64+32+80),
	i.e. it is left pointing 240 bytes past the start of the key data.
 */
.macro load_aes_keys key_addr:req
	ld1	{vKey0.4s- vKey3.4s},[\key_addr],64
	ld1	{vKey4.4s- vKey7.4s},[\key_addr],64
	ldp	qKey8,qKey9,[\key_addr],32
	ldr	qKey10,[\key_addr],15*16 - 128 - 32
.endm



/*
	One interleaved AES-CTR + GHASH step for a non-first block.

	[low,high] +=aadhash * [hashkey0,hashkey0_ext]
		(the middle product is split and folded into low/high here)
	ctr += one
	dat=*dat_adr
	enc_dat=aes_encrypt(rev32(ctr))^dat
	*out_adr=enc_dat
	aadhash=rbit(enc_dat) when encrypting, rbit(dat) when decrypting
	if left_count > 1: [hashkey0,hashkey0_ext] = *hashkey_adr
	dat_adr+=16
	out_adr+=16
	hashkey_adr+=32 (two 16-byte post-increments, only if left_count > 1)

	The AES rounds are deliberately interleaved with the PMULL/EOR work;
	do not reorder.
*/
.macro aes_gcm_middle is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
	hashkey0:req,hashkey0_ext:req,high:req,low:req, \
	ctr:req,enc_ctr:req,one:req,out_adr:req, \
	tmp0:req,tmp1:req,left_count:req

	pmull2	v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
	pmull	v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
	.if \left_count > 1
		ldr	q\hashkey0,[\hashkey_adr],16
	.endif

	add	v\ctr\().4s,v\ctr\().4s,v\one\().4s	//increase ctr

	rev32	v\enc_ctr\().16b,v\ctr\().16b
	aes_encrypt_round \enc_ctr,Key0
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	pmull	v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b
	pmull2	v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
	.if \left_count > 1
		ldr	q\hashkey0_ext,[\hashkey_adr],16
	.endif
	/* aadhash is consumed; reuse it as the zero vector for the ext pair */
	eor	v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
	aes_encrypt_round \enc_ctr,Key1
	aes_encrypt_round \enc_ctr,Key2
	eor	v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
	aes_encrypt_round \enc_ctr,Key3
	ext	v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8
	ext	v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8
	aes_encrypt_round \enc_ctr,Key4
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	aes_encrypt_round \enc_ctr,Key5
	/* aadhash now receives the next input block */
	ldr	q\aadhash,[\dat_adr],16
	aes_encrypt_round \enc_ctr,Key6
	aes_encrypt_round \enc_ctr,Key7
	aes_encrypt_round \enc_ctr,Key8
	aese	v\enc_ctr\().16b,vKey9.16b
	eor	v\enc_ctr\().16b,v\enc_ctr\().16b,vKey10.16b
	eor	v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
	/* GHASH always runs over the ciphertext: output when encrypting, input when decrypting */
	.ifc \is_enc, encrypt
		rbit	v\aadhash\().16b,v\enc_ctr\().16b
	.endif
	.ifc \is_enc , decrypt
		rbit	v\aadhash\().16b,v\aadhash\().16b
	.endif
	str	q\enc_ctr,[\out_adr],16
.endm

/*
	First interleaved AES-CTR + GHASH step of a run of blocks.
	Same data flow as aes_gcm_middle, except that it always loads the first
	[hashkey0,hashkey0_ext] pair from hashkey_adr and initialises high/low
	with the first products before folding the middle term into them.
*/
.macro aes_gcm_init is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
	hashkey0:req,hashkey0_ext:req, high:req,low:req, \
	ctr:req,enc_ctr:req,one:req,out_adr:req, \
	tmp0:req,tmp1:req,left_count:req
	ldr	q\hashkey0,[\hashkey_adr],16
	add	v\ctr\().4s,v\ctr\().4s,v\one\().4s	//increase ctr
	rev32	v\enc_ctr\().16b,v\ctr\().16b
	aes_encrypt_round \enc_ctr,Key0
	ldr	q\hashkey0_ext,[\hashkey_adr],16
	aes_encrypt_round \enc_ctr,Key1
	pmull2	v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d
	pmull	v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d

	.if \left_count > 1
		ldr	q\hashkey0,[\hashkey_adr],16
	.endif
	aes_encrypt_round \enc_ctr,Key2
	pmull	v\tmp1\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
	pmull2	v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
	/* aadhash is consumed; reuse it as the zero vector for the ext pair */
	eor	v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b

	.if \left_count > 1
		ldr	q\hashkey0_ext,[\hashkey_adr],16
	.endif
	aes_encrypt_round \enc_ctr,Key3
	eor	v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b

	aes_encrypt_round \enc_ctr,Key4
	ext	v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8	//low
	ext	v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8	//high
	aes_encrypt_round \enc_ctr,Key5
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	aes_encrypt_round \enc_ctr,Key6
	/* aadhash now receives the next input block */
	ldr	q\aadhash,[\dat_adr],16
	aes_encrypt_round \enc_ctr,Key7
	aes_encrypt_round \enc_ctr,Key8
	aese	v\enc_ctr\().16b,vKey9.16b
	eor	v\enc_ctr\().16b,v\enc_ctr\().16b,vKey10.16b
	eor	v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
	.ifc \is_enc , encrypt
		rbit	v\aadhash\().16b,v\enc_ctr\().16b
	.endif
	.ifc \is_enc , decrypt
		rbit	v\aadhash\().16b,v\aadhash\().16b
	.endif
	str	q\enc_ctr,[\out_adr],16
.endm

diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S new file mode 100644 index 000000000..fb6a6e94d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S @@ -0,0 +1,181 @@
/**********************************************************************
  Copyright(c) 2021 Arm Corporation All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/
/* AES-256 specialisation of the shared GCM macros (15 round keys). */
#define KEY_LEN 256
#include "gcm_common.S"
	/* Round keys 11..14 live in v27..v30 (Key0..Key10 come from gcm_common.S). */
	declare_var_vector_reg	Key11,27
	declare_var_vector_reg	Key12,28
	declare_var_vector_reg	Key13,29
	declare_var_vector_reg	Key14,30
#define KEY_REGS 0,1,2,3,4,5,6,7,8,9,10,11,12
/*
 * Encrypt block in place with the round keys held in vKey0..vKey14:
 * thirteen AESE+AESMC rounds, a final AESE with Key13, then XOR with Key14.
 */
.macro aes_encrypt_block block:req
	aes_encrypt_round \block,Key0
	aes_encrypt_round \block,Key1
	aes_encrypt_round \block,Key2
	aes_encrypt_round \block,Key3
	aes_encrypt_round \block,Key4
	aes_encrypt_round \block,Key5
	aes_encrypt_round \block,Key6
	aes_encrypt_round \block,Key7
	aes_encrypt_round \block,Key8
	aes_encrypt_round \block,Key9
	aes_encrypt_round \block,Key10
	aes_encrypt_round \block,Key11
	aes_encrypt_round \block,Key12
	aese	v\block\().16b,vKey13.16b
	eor	v\block\().16b,v\block\().16b,vKey14.16b
.endm

/*
	Load the 15 AES round keys into vKey0..vKey14.
	key_addr is post-incremented by 15*16 bytes in total (64+64+64+48).
 */
.macro load_aes_keys key_addr:req
	ld1	{ vKey0.4s- vKey3.4s},[\key_addr],64
	ld1	{ vKey4.4s- vKey7.4s},[\key_addr],64
	ld1	{ vKey8.4s- vKey11.4s},[\key_addr],64
	ld1	{vKey12.4s- vKey14.4s},[\key_addr],48
.endm



/*
	One interleaved AES-CTR + GHASH step for a non-first block.

	[low,high] +=aadhash * [hashkey0,hashkey0_ext]
		(the middle product is split and folded into low/high here)
	ctr += one
	dat=*dat_adr
	enc_dat=aes_encrypt(rev32(ctr))^dat
	*out_adr=enc_dat
	aadhash=rbit(enc_dat) when encrypting, rbit(dat) when decrypting
	if left_count > 1: [hashkey0,hashkey0_ext] = *hashkey_adr
	dat_adr+=16
	out_adr+=16
	hashkey_adr+=32 (two 16-byte post-increments, only if left_count > 1)

	The AES rounds are deliberately interleaved with the PMULL/EOR work;
	do not reorder.
*/
.macro aes_gcm_middle is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
	hashkey0:req,hashkey0_ext:req,high:req,low:req, \
	ctr:req,enc_ctr:req,one:req,out_adr:req, \
	tmp0:req,tmp1:req,left_count:req

	pmull2	v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
	pmull	v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
	.if \left_count > 1
		ldr	q\hashkey0,[\hashkey_adr],16
	.endif

	add	v\ctr\().4s,v\ctr\().4s,v\one\().4s	//increase ctr

	rev32	v\enc_ctr\().16b,v\ctr\().16b
	aes_encrypt_round \enc_ctr,Key0
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	pmull	v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b
	pmull2	v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
	.if \left_count > 1
		ldr	q\hashkey0_ext,[\hashkey_adr],16
	.endif
	/* aadhash is consumed; reuse it as the zero vector for the ext pair */
	eor	v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
	aes_encrypt_round \enc_ctr,Key1
	aes_encrypt_round \enc_ctr,Key2
	eor	v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
	aes_encrypt_round \enc_ctr,Key3
	ext	v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8
	ext	v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8
	aes_encrypt_round \enc_ctr,Key4
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	aes_encrypt_round \enc_ctr,Key5
	/* aadhash now receives the next input block */
	ldr	q\aadhash,[\dat_adr],16
	aes_encrypt_round \enc_ctr,Key6
	aes_encrypt_round \enc_ctr,Key7
	aes_encrypt_round \enc_ctr,Key8
	aes_encrypt_round \enc_ctr,Key9
	aes_encrypt_round \enc_ctr,Key10
	aes_encrypt_round \enc_ctr,Key11
	aes_encrypt_round \enc_ctr,Key12
	aese	v\enc_ctr\().16b,vKey13.16b
	eor	v\enc_ctr\().16b,v\enc_ctr\().16b,vKey14.16b
	eor	v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
	/* GHASH always runs over the ciphertext: output when encrypting, input when decrypting */
	.ifc \is_enc , encrypt
		rbit	v\aadhash\().16b,v\enc_ctr\().16b
	.endif
	.ifc \is_enc , decrypt
		rbit	v\aadhash\().16b,v\aadhash\().16b
	.endif
	str	q\enc_ctr,[\out_adr],16
.endm

/*
	First interleaved AES-CTR + GHASH step of a run of blocks.
	Same data flow as aes_gcm_middle, except that it always loads the first
	[hashkey0,hashkey0_ext] pair from hashkey_adr and initialises high/low
	with the first products before folding the middle term into them.
*/
.macro aes_gcm_init is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
	hashkey0:req,hashkey0_ext:req, high:req,low:req, \
	ctr:req,enc_ctr:req,one:req,out_adr:req, \
	tmp0:req,tmp1:req,left_count:req
	ldr	q\hashkey0,[\hashkey_adr],16
	add	v\ctr\().4s,v\ctr\().4s,v\one\().4s	/*increase ctr */
	rev32	v\enc_ctr\().16b,v\ctr\().16b
	aes_encrypt_round \enc_ctr,Key0
	ldr	q\hashkey0_ext,[\hashkey_adr],16
	aes_encrypt_round \enc_ctr,Key1
	pmull2	v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d
	pmull	v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d

	.if \left_count > 1
		ldr	q\hashkey0,[\hashkey_adr],16
	.endif
	aes_encrypt_round \enc_ctr,Key2
	pmull	v\tmp1\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
	pmull2	v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
	/* aadhash is consumed; reuse it as the zero vector for the ext pair */
	eor	v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b

	.if \left_count > 1
		ldr	q\hashkey0_ext,[\hashkey_adr],16
	.endif
	aes_encrypt_round \enc_ctr,Key3
	eor	v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b

	aes_encrypt_round \enc_ctr,Key4
	ext	v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8	/*low */
	ext	v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8	/* high */
	aes_encrypt_round \enc_ctr,Key5
	eor	v\low\().16b,v\low\().16b,v\tmp1\().16b
	eor	v\high\().16b,v\high\().16b,v\tmp0\().16b
	aes_encrypt_round \enc_ctr,Key6
	/* aadhash now receives the next input block */
	ldr	q\aadhash,[\dat_adr],16
	aes_encrypt_round \enc_ctr,Key7
	aes_encrypt_round \enc_ctr,Key8
	aes_encrypt_round \enc_ctr,Key9
	aes_encrypt_round \enc_ctr,Key10
	aes_encrypt_round \enc_ctr,Key11
	aes_encrypt_round \enc_ctr,Key12
	aese	v\enc_ctr\().16b,vKey13.16b
	eor	v\enc_ctr\().16b,v\enc_ctr\().16b,vKey14.16b
	eor	v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
	.ifc \is_enc , encrypt
		rbit	v\aadhash\().16b,v\enc_ctr\().16b
	.endif
	.ifc \is_enc , decrypt
		rbit	v\aadhash\().16b,v\aadhash\().16b
	.endif
	str	q\enc_ctr,[\out_adr],16
.endm


diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S new file mode 100644 index 000000000..927179cfc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S @@ -0,0 +1,588 @@
/**********************************************************************
  Copyright(c) 2021 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +/* +void gist_aes_gcm_dec_##mode( \ + const struct gcm_key_data *key_data, \ + struct gcm_context_data *context, \ + uint8_t *out, \ + uint8_t const *in, \ + uint64_t len, \ + uint8_t *iv, \ + \ + uint8_t const *aad, \ + uint64_t aad_len, \ + uint8_t *auth_tag, \ + uint64_t auth_tag_len \ + \ + ) + */ + + declare_var_generic_reg key_data ,0 + declare_var_generic_reg context ,1 + declare_var_generic_reg out ,2 + declare_var_generic_reg in ,3 + declare_var_generic_reg len ,4 + declare_var_generic_reg iv ,5 + declare_var_generic_reg aad ,6 + declare_var_generic_reg aad_len ,7 + + declare_var_generic_reg hashkey_base,0 + declare_var_generic_reg hashkey_addr,5 + declare_var_generic_reg left_len ,12 + declare_var_generic_reg aad_left ,13 + declare_var_generic_reg temp0 ,14 + declare_var_generic_reg temp1 ,15 + + declare_var_generic_reg auth_tag ,0 /* input param */ + declare_var_generic_reg auth_tag_len,1 /* input param */ + + + declare_var_vector_reg Ctr,0 + declare_var_vector_reg AadHash,1 + declare_var_vector_reg HashKey0,2 + declare_var_vector_reg HashKey0Ext,3 + declare_var_vector_reg High,4 + declare_var_vector_reg Low,5 + declare_var_vector_reg EncCtr,6 + declare_var_vector_reg Dat0,6 + declare_var_vector_reg Middle0,7 + + declare_var_vector_reg Tmp0,8 + declare_var_vector_reg Tmp1,9 + declare_var_vector_reg Zero,10 + declare_var_vector_reg Poly,11 + declare_var_vector_reg LeftDat ,12 + declare_var_vector_reg Len ,13 + declare_var_vector_reg Tmp2,14 + declare_var_vector_reg Tmp3,15 + + declare_var_vector_reg One,31 + .set stack_size,64 + .macro push_stack + stp d8, d9,[sp,-stack_size]! 
+ stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + stp d14,d15,[sp,48] + + .endm + + .macro pop_stack + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d14,d15,[sp,48] + ldp d8, d9, [sp], stack_size /* post-index: also releases the stack_size-byte frame */ + .endm + +START_FUNC(enc,KEY_LEN,_) +START_FUNC(enc,KEY_LEN,_nt_) + push_stack + /* save aad_len and len as a pair at context+AAD_LEN_OFF */ + stp aad_len,len,[context,AAD_LEN_OFF] + load_aes_keys key_data + /* Init Consts and IV */ + mov wtemp1,1 + eor vOne.16b,vOne.16b,vOne.16b + ld1 {vCtr.d}[0],[iv],8 + eor vZero.16b,vZero.16b,vZero.16b + ld1 {vCtr.s}[2],[iv] + mov temp0,0x87 + rev32 vCtr.16b,vCtr.16b /* to cpu order */ + ins vOne.s[3],wtemp1 + mov vAadHash.16b,vZero.16b + dup vPoly.2d,temp0 + ins vCtr.s[3],wtemp1 /* Initial Ctr and Orig IV */ + + + and left_len,aad_len,0xf /* left_len = aad_len % 16 */ + cbz aad_len,24f /* no AAD at all */ + lsr aad_len,aad_len,4 /* aad_len now counts whole 16-byte blocks */ + /* Read small data */ + cbz left_len,2f /* no partial tail (aad_len % 16 == 0), skip */ + add aad_left,aad,aad_len,lsl 4 + read_small_data_start LeftDat,aad_left,left_len,temp0,Tmp0 + cbnz left_len,1f /* aad_len & 0xf != 0 */ +2: + cbz aad_len,1f /* aad_len <16 skip*/ + /* left_len == 0 && aad_len !=0 */ + sub aad_len,aad_len,1 + /* LeftDat = last whole AAD block */ + ldr qLeftDat,[aad,aad_len,lsl 4] +1: + cbnz aad_len,1f /* aad_len >16,skip */ + rbit vAadHash.16b,vLeftDat.16b + b 24f /* aad_len <=16, skip aadhash calculate */ +1: + /* aad_len > 16 */ + ldr qAadHash,[aad],16 + rbit vAadHash.16b,vAadHash.16b + sub aad_len,aad_len,1 + +1: + /* loop ghash_block */ + cmp aad_len,HASHKEY_TOTAL_NUM - 1 + bls 1f // break loop + sub aad_len,aad_len,HASHKEY_TOTAL_NUM + ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbnz aad_len,1f /* left aad_len >32,skip */ + ldp qHashKey0,qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32] + ghash_block_reg AadHash,LeftDat, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0 + b 24f /* left aad_len <=32,skip below check */ +1: + mov
temp0,HASHKEY_TOTAL_NUM - 1 + sub temp0,temp0,aad_len + add hashkey_addr,hashkey_base,temp0,lsl 5 + + ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */ + sub aad_len,aad_len,1 + +1: + cbz aad_len,1f + ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Tmp1,Dat0, 2 + + sub aad_len,aad_len,1 + b 1b +1: + ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1 + rbit vAadHash.16b, vLeftDat.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + +24: + + /* Enc/Dec loop */ + and left_len,len,15 /* left_len = len % 16 */ + cbz len,24f + lsr len,len,4 /* len now counts whole 16-byte blocks */ +1: + /* loop aes gcm enc/dec loop */ + cmp len,HASHKEY_TOTAL_NUM - 1 + bls 1f // break loop + sub len,len,HASHKEY_TOTAL_NUM + aes_gcm_n_round encrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Poly, \ + Ctr,EncCtr,One,out,Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbz len,24f /* left len == 0 */ + mov temp0,HASHKEY_TOTAL_NUM + sub temp0,temp0,len + add hashkey_addr,hashkey_base,temp0,lsl 5 /* slot (HASHKEY_TOTAL_NUM - len), 32 bytes per slot */ + + sub len,len,1 + aes_gcm_init encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbz len,2f + sub len,len,1 +1: + + cbz len,1f + aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + sub len,len,1 + b 1b +1: + aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* last block: not load next hash */ +2: + poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly +24: + /* complete part */ + cmp left_len,0 + movi vHigh.16b,0 + mov temp0,HASHKEY_TOTAL_NUM-3 + movi vLow.16b,0 + cinc hashkey_addr,temp0,eq /* left_len == 0: start one slot later */ + movi vMiddle0.16b,0 + add hashkey_addr,hashkey_base,hashkey_addr,lsl 5 + ldp qHashKey0,qHashKey0Ext,[hashkey_addr],32 + beq 2f /* flags still from cmp left_len,0: no partial block */ + read_small_data_start
LeftDat,in,left_len,temp0,Tmp0 + add vCtr.4s,vCtr.4s,vOne.4s + rev32 vEncCtr.16b,vCtr.16b + aes_encrypt_round EncCtr,Key0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round EncCtr,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round EncCtr,Key2 + ldr qHashKey0,[hashkey_addr],16 + aes_encrypt_round EncCtr,Key3 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round EncCtr,Key4 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round EncCtr,Key5 + ldr qHashKey0Ext,[hashkey_addr],16 + aes_encrypt_round EncCtr,Key6 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aes_encrypt_round EncCtr,Key7 + aes_encrypt_round EncCtr,Key8 +#if KEY_LEN==256 + aes_encrypt_round EncCtr,Key9 + aes_encrypt_round EncCtr,Key10 + aes_encrypt_round EncCtr,Key11 + aes_encrypt_round EncCtr,Key12 + aese vEncCtr.16b,vKey13.16b + eor vEncCtr.16b,vEncCtr.16b,vKey14.16b +#else + aese vEncCtr.16b,vKey9.16b + eor vEncCtr.16b,vEncCtr.16b,vKey10.16b +#endif + eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b + write_small_data_start EncCtr,out,left_len,temp0,Tmp0 + clear_small_data EncCtr,Zero,left_len,temp0,Tmp0 + rbit vAadHash.16b,vEncCtr.16b +2: + + ldr qLen,[context,AAD_LEN_OFF] /* Len */ + mov wtemp0,1 /* Ek */ + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d /* auth_dat * HashKey[Total-2] */ + shl vLen.2d,vLen.2d,3 /* Len */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* auth_dat * HashKey[Total-2] */ + rev64 vLen.16b,vLen.16b /* Len */ + ins vCtr.4s[3],wtemp0 /* Ek */ + ldr qHashKey0,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */ + pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* auth_dat * HashKey[Total-2] */ + rev32 vEncCtr.16b,vCtr.16b /* Ek */ + eor vHigh.16b,vHigh.16b,vTmp0.16b /* auth_dat * HashKey[Total-2] */ + pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* auth_dat * HashKey[Total-2] */ + rbit vAadHash.16b,vLen.16b /* Len */ + + aes_encrypt_round EncCtr,Key0 /* Ek */ + eor vLow.16b,vLow.16b,vTmp1.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round 
EncCtr,Key1 /* Ek */ + ldr qHashKey0Ext,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key2 /* Ek */ + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key3 /* Ek */ + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key4 /* Ek */ + + pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d /* Len * HashKey[Total-1] */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key5 /* Ek */ + aes_encrypt_round EncCtr,Key6 /* Ek */ + pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key7 /* Ek */ + eor vHigh.16b,vHigh.16b,vTmp0.16b /* Len * HashKey[Total-1] */ + pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key8 /* Ek */ + eor vLow.16b,vLow.16b,vTmp1.16b /* Len * HashKey[Total-1] */ +#if KEY_LEN==256 + aes_encrypt_round EncCtr,Key9 /* Ek */ + aes_encrypt_round EncCtr,Key10 /* Ek */ + aes_encrypt_round EncCtr,Key11 /* Ek */ + aes_encrypt_round EncCtr,Key12 /* Ek */ + aese vEncCtr.16b,vKey13.16b /* Ek */ + eor vEncCtr.16b,vEncCtr.16b,vKey14.16b /* Ek */ +#else + aese vEncCtr.16b,vKey9.16b /* Ek */ + eor vEncCtr.16b,vEncCtr.16b,vKey10.16b /* Ek */ +#endif + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* Len * HashKey[Total-1] */ + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* Len * HashKey[Total-1] */ + rbit vAadHash.16b,vEncCtr.16b /* Aad */ + + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + ldp auth_tag,auth_tag_len,[sp,stack_size] /* Adjust here : TODO TBD */ + rbit vAadHash.16b,vAadHash.16b /* Aad */ + + + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 
*/ + str dAadHash,[auth_tag] + pop_stack + ret +END_FUNC(enc,KEY_LEN,_) +END_FUNC(enc,KEY_LEN,_nt_) + + +START_FUNC(dec,KEY_LEN,_) +START_FUNC(dec,KEY_LEN,_nt_) + push_stack + /* save aad_len and len as a pair at context+AAD_LEN_OFF */ + stp aad_len,len,[context,AAD_LEN_OFF] + load_aes_keys key_data + /* Init Consts and IV */ + mov wtemp1,1 + eor vOne.16b,vOne.16b,vOne.16b + ld1 {vCtr.d}[0],[iv],8 + eor vZero.16b,vZero.16b,vZero.16b + ld1 {vCtr.s}[2],[iv] + mov temp0,0x87 + rev32 vCtr.16b,vCtr.16b /* to cpu order */ + mov vAadHash.16b,vZero.16b + ins vOne.s[3],wtemp1 + dup vPoly.2d,temp0 + ins vCtr.s[3],wtemp1 /* Initial Ctr and Orig IV */ + + ldp qHashKey0,qHashKey0Ext,[hashkey_base] /* NOTE(review): enc has no equivalent preload; confirm whether this load is needed */ + and left_len,aad_len,0xf /* left_len = aad_len % 16 */ + cbz aad_len,24f /* no AAD at all */ + lsr aad_len,aad_len,4 /* aad_len now counts whole 16-byte blocks */ + /* Read small data */ + cbz left_len,2f /* no partial tail (aad_len % 16 == 0), skip */ + add aad_left,aad,aad_len,lsl 4 + read_small_data_start LeftDat,aad_left,left_len,temp0,Tmp0 + cbnz left_len,1f /* aad_len & 0xf != 0 */ +2: + cbz aad_len,1f /* aad_len <16 skip */ + /* left_len == 0 && aad_len !=0 */ + sub aad_len,aad_len,1 + /* LeftDat = last whole AAD block */ + ldr qLeftDat,[aad,aad_len,lsl 4] +1: + cbnz aad_len,1f /* aad_len >16,skip */ + rbit vAadHash.16b,vLeftDat.16b + b 24f /* aad_len <=16, skip aadhash calculate */ +1: + /* aad_len > 16 */ + ldr qAadHash,[aad],16 + rbit vAadHash.16b,vAadHash.16b + sub aad_len,aad_len,1 + +1: + /** loop ghash_block */ + cmp aad_len,HASHKEY_TOTAL_NUM - 1 + bls 1f /* break loop */ + sub aad_len,aad_len,HASHKEY_TOTAL_NUM + ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbnz aad_len,1f /* left aad_len >32,skip */ + ldp qHashKey0,qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32] + ghash_block_reg AadHash,LeftDat, \ + HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \ + Tmp0 + b 24f /* left aad_len <=32,skip below check */ +1: + mov temp0,HASHKEY_TOTAL_NUM - 1 + sub temp0,temp0,aad_len + add
hashkey_addr,hashkey_base,temp0,lsl 5 + + ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */ + sub aad_len,aad_len,1 + +1: + cbz aad_len,1f + ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Middle0,Tmp0,Tmp1,Dat0, 2 + + sub aad_len,aad_len,1 + b 1b +1: + ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1 + rbit vAadHash.16b, vLeftDat.16b + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + +24: + + + /* Enc/Dec loop */ + and left_len,len,15 /* left_len = len % 16 */ + cbz len,24f + lsr len,len,4 /* len now counts whole 16-byte blocks */ +1: + /* loop aes gcm enc/dec loop */ + cmp len,HASHKEY_TOTAL_NUM - 1 + bls 1f // break loop + sub len,len,HASHKEY_TOTAL_NUM + aes_gcm_n_round decrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Poly, \ + Ctr,EncCtr,One,out,Tmp0,Tmp1 + b 1b /* back to loop start */ +1: + cbz len,24f /* left len == 0 */ + mov temp0,HASHKEY_TOTAL_NUM + sub temp0,temp0,len + add hashkey_addr,hashkey_base,temp0,lsl 5 /* slot (HASHKEY_TOTAL_NUM - len), 32 bytes per slot */ + + sub len,len,1 + aes_gcm_init decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbz len,2f + sub len,len,1 +1: + + cbz len,1f + aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + sub len,len,1 + b 1b +1: + aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* last block: not load next hash */ +2: + poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly +24: + /* complete part */ + cmp left_len,0 + movi vHigh.16b,0 + mov temp0,HASHKEY_TOTAL_NUM-3 /* same slot expression as the enc path, no literal index */ + movi vLow.16b,0 + cinc hashkey_addr,temp0,eq /* left_len == 0: start one slot later */ + movi vMiddle0.16b,0 + add hashkey_addr,hashkey_base,hashkey_addr,lsl 5 + ldp qHashKey0,qHashKey0Ext,[hashkey_addr],32 + beq 2f /* flags still from cmp left_len,0: no partial block */ + read_small_data_start LeftDat,in,left_len,temp0,Tmp0 + add vCtr.4s,vCtr.4s,vOne.4s + rev32 vEncCtr.16b,vCtr.16b +
aes_encrypt_round EncCtr,Key0 + pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d + aes_encrypt_round EncCtr,Key1 + pmull vLow.1q ,vAadHash.1d,vHashKey0.1d + aes_encrypt_round EncCtr,Key2 + ldr qHashKey0,[hashkey_addr],16 + aes_encrypt_round EncCtr,Key3 + pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d + aes_encrypt_round EncCtr,Key4 + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d + aes_encrypt_round EncCtr,Key5 + ldr qHashKey0Ext,[hashkey_addr],16 + aes_encrypt_round EncCtr,Key6 + eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b + aes_encrypt_round EncCtr,Key7 + aes_encrypt_round EncCtr,Key8 +#if KEY_LEN==256 + aes_encrypt_round EncCtr,Key9 + aes_encrypt_round EncCtr,Key10 + aes_encrypt_round EncCtr,Key11 + aes_encrypt_round EncCtr,Key12 + aese vEncCtr.16b,vKey13.16b + eor vEncCtr.16b,vEncCtr.16b,vKey14.16b + eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b +#endif +#if KEY_LEN==128 + aese vEncCtr.16b,vKey9.16b + eor vEncCtr.16b,vEncCtr.16b,vKey10.16b + eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b +#endif + write_small_data_start EncCtr,out,left_len,temp0,Tmp0 + rbit vAadHash.16b,vLeftDat.16b + +2: + + ldr qLen,[context,AAD_LEN_OFF] /* Len */ + mov wtemp0,1 /* Ek */ + pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d /* auth_dat * HashKey[Total-2] */ + shl vLen.2d,vLen.2d,3 /* Len */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* auth_dat * HashKey[Total-2] */ + rev64 vLen.16b,vLen.16b /* Len */ + ins vCtr.4s[3],wtemp0 /* Ek */ + ldr qHashKey0,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */ + pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* auth_dat * HashKey[Total-2] */ + rev32 vEncCtr.16b,vCtr.16b /* Ek */ + eor vHigh.16b,vHigh.16b,vTmp0.16b /* auth_dat * HashKey[Total-2] */ + pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* auth_dat * HashKey[Total-2] */ + rbit vAadHash.16b,vLen.16b /* Len */ + + aes_encrypt_round EncCtr,Key0 /* Ek */ + eor vLow.16b,vLow.16b,vTmp1.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key1 /* Ek */ + ldr qHashKey0Ext,[hashkey_addr],16 /* auth_dat * 
HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key2 /* Ek */ + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key3 /* Ek */ + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* auth_dat * HashKey[Total-2] */ + aes_encrypt_round EncCtr,Key4 /* Ek */ + + pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d /* Len * HashKey[Total-1] */ + pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key5 /* Ek */ + aes_encrypt_round EncCtr,Key6 /* Ek */ + pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key7 /* Ek */ + eor vHigh.16b,vHigh.16b,vTmp0.16b /* Len * HashKey[Total-1] */ + pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* Len * HashKey[Total-1] */ + aes_encrypt_round EncCtr,Key8 /* Ek */ + eor vLow.16b,vLow.16b,vTmp1.16b /* Len * HashKey[Total-1] */ +#if KEY_LEN==256 + aes_encrypt_round EncCtr,Key9 /* Ek */ + aes_encrypt_round EncCtr,Key10 /* Ek */ + aes_encrypt_round EncCtr,Key11 /* Ek */ + aes_encrypt_round EncCtr,Key12 /* Ek */ + aese vEncCtr.16b,vKey13.16b /* Ek */ + eor vEncCtr.16b,vEncCtr.16b,vKey14.16b /* Ek */ +#else + aese vEncCtr.16b,vKey9.16b /* Ek */ + eor vEncCtr.16b,vEncCtr.16b,vKey10.16b /* Ek */ +#endif + eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* Len * HashKey[Total-1] */ + eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* Len * HashKey[Total-1] */ + rbit vAadHash.16b,vEncCtr.16b /* Aad */ + + ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly + + ldp auth_tag,auth_tag_len,[sp,stack_size] /* Adjust here : TODO TBD */ + rbit vAadHash.16b,vAadHash.16b /* Aad */ + + + /* output auth_tag */ + cmp auth_tag_len,16 + bne 1f + /* most likely auth_tag_len=16 */ + str qAadHash,[auth_tag] + pop_stack + ret +1: /* auth_tag_len=12 */ + cmp auth_tag_len,12 + bne 1f + str dAadHash,[auth_tag],8 + st1 {vAadHash.s}[2],[auth_tag] + pop_stack + ret +1: /* auth_tag_len=8 */ + str dAadHash,[auth_tag] + pop_stack + ret +END_FUNC(dec,KEY_LEN,_) 
+END_FUNC(dec,KEY_LEN,_nt_) diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S new file mode 100644 index 000000000..b5433a1df --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S @@ -0,0 +1,58 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "aarch64_multibinary.h" + +mbin_interface aes_gcm_enc_128 +mbin_interface aes_gcm_dec_128 +mbin_interface aes_gcm_precomp_128 +mbin_interface aes_gcm_enc_256 +mbin_interface aes_gcm_dec_256 +mbin_interface aes_gcm_precomp_256 + + +mbin_interface aes_gcm_enc_128_update +mbin_interface aes_gcm_enc_128_finalize +mbin_interface aes_gcm_dec_128_update +mbin_interface aes_gcm_dec_128_finalize +mbin_interface aes_gcm_enc_256_update +mbin_interface aes_gcm_enc_256_finalize +mbin_interface aes_gcm_dec_256_update +mbin_interface aes_gcm_dec_256_finalize + +mbin_interface aes_gcm_init_256 +mbin_interface aes_gcm_init_128 +mbin_interface aes_gcm_enc_128_nt +mbin_interface aes_gcm_enc_128_update_nt +mbin_interface aes_gcm_dec_128_nt +mbin_interface aes_gcm_dec_128_update_nt +mbin_interface aes_gcm_enc_256_nt +mbin_interface aes_gcm_enc_256_update_nt +mbin_interface aes_gcm_dec_256_nt +mbin_interface aes_gcm_dec_256_update_nt diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S new file mode 100644 index 000000000..e555c9798 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S @@ -0,0 +1,83 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +/* +void aes_gcm_precomp(struct gcm_key_data *key_data); +*/ + declare_var_generic_reg key_data ,0 + declare_var_generic_reg temp0 ,1 + declare_var_generic_reg hashkey_base,0 + declare_var_generic_reg hashkey_addr,1 + + declare_var_vector_reg Low ,0 + declare_var_vector_reg Middle0 ,1 + declare_var_vector_reg Middle1 ,2 + declare_var_vector_reg High ,3 + declare_var_vector_reg HashKeyIter ,4 + declare_var_vector_reg HashKey ,5 + declare_var_vector_reg HashKeyExt ,6 + declare_var_vector_reg Poly ,7 + declare_var_vector_reg Zero ,31 + +START_FUNC(precomp,KEY_LEN,_) + load_aes_keys key_data + mov temp0,0x87 + eor vZero.16b,vZero.16b,vZero.16b + eor vHashKey.16b,vHashKey.16b,vHashKey.16b + dup vPoly.2d,temp0 + aes_encrypt_block HashKey + add hashkey_addr,hashkey_base,(HASHKEY_TOTAL_NUM-1)*32 + rbit vHashKey.16b,vHashKey.16b + ext vHashKeyExt.16b,vHashKey.16b,vHashKey.16b,8 + mov vHashKeyIter.16b,vHashKey.16b + stp 
qHashKey,qHashKeyExt,[hashkey_addr],-32 + +1: + pmull vMiddle0.1q,vHashKeyIter.1d,vHashKeyExt.1d + pmull2 vMiddle1.1q,vHashKeyIter.2d,vHashKeyExt.2d + pmull vLow.1q ,vHashKeyIter.1d,vHashKey.1d + eor vMiddle0.16b,vMiddle0.16b,vMiddle1.16b + pmull2 vHigh.1q ,vHashKeyIter.2d,vHashKey.2d + ext vMiddle1.16b,vMiddle0.16b,vZero.16b,8 //high + ext vMiddle0.16b,vZero.16b,vMiddle0.16b,8 //low + eor vHigh.16b ,vHigh.16b,vMiddle1.16b + eor vLow.16b ,vLow.16b ,vMiddle0.16b + pmull2 vMiddle0.1q ,vHigh.2d ,vPoly.2d + ext vMiddle1.16b,vMiddle0.16b,vZero.16b,8 //high + ext vMiddle0.16b,vZero.16b,vMiddle0.16b,8 //low + eor vHigh.16b ,vHigh.16b,vMiddle1.16b + eor vLow.16b ,vLow.16b ,vMiddle0.16b + pmull vMiddle0.1q ,vHigh.1d ,vPoly.1d + eor vHashKeyIter.16b,vLow.16b,vMiddle0.16b + ext vLow.16b,vHashKeyIter.16b,vHashKeyIter.16b,8 + stp qHashKeyIter,qLow,[hashkey_addr],-32 + cmp hashkey_addr,hashkey_base + bcs 1b + + ret +END_FUNC(precomp,KEY_LEN,_) diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S new file mode 100644 index 000000000..d47c52212 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S @@ -0,0 +1,277 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +/* +void gist_aes_gcm_dec_update_##mode( \ + const struct gcm_key_data *key_data, \ + struct gcm_context_data *context, \ + uint8_t *out, \ + const uint8_t *in, \ + uint64_t len \ + ) + */ + + declare_var_generic_reg key_data ,0 + declare_var_generic_reg context ,1 + declare_var_generic_reg out ,2 + declare_var_generic_reg in ,3 + declare_var_generic_reg len ,4 + declare_var_generic_reg partial_block_length,5 + declare_var_generic_reg blocks ,5 + declare_var_generic_reg hashkey_base,0 + declare_var_generic_reg hashkey_addr,6 + declare_var_generic_reg temp0 ,14 + declare_var_generic_reg temp1 ,15 + declare_var_generic_reg temp2 ,13 + + + + declare_var_vector_reg Ctr,0 + declare_var_vector_reg AadHash,1 + declare_var_vector_reg HashKey0,2 + declare_var_vector_reg HashKey0Ext,3 + declare_var_vector_reg High,4 + declare_var_vector_reg Low,5 + declare_var_vector_reg EncCtr,6 + declare_var_vector_reg Middle,7 + + declare_var_vector_reg Tmp0,8 + 
declare_var_vector_reg Tmp1,9 + declare_var_vector_reg Zero,10 + declare_var_vector_reg Poly,11 + declare_var_vector_reg PartialBlock ,12 + declare_var_vector_reg One,31 + .set stack_size,48 + .macro push_stack + stp d8, d9, [sp,-stack_size]! + stp d10,d11,[sp,16] + stp d12,d13,[sp,32] + + .endm + + .macro pop_stack + ldp d10,d11,[sp,16] + ldp d12,d13,[sp,32] + ldp d8, d9, [sp], stack_size + .endm +/* + 20:exit_without_popstack + 21:start_of_mainloop + 22:exit_with_popstack + 23:partial_block_start + */ +START_FUNC(enc,KEY_LEN,_update_) +START_FUNC(enc,KEY_LEN,_update_nt_) + ldr temp0,[context,IN_LENGTH_OFF] /*load in_length */ + ldr partial_block_length,[context,PARTIAL_BLOCK_LENGTH_OFF] + ldr qAadHash,[context] + cbz len,20f /** if(len==0)return; exit_without_popstack*/ + push_stack + add temp0,temp0,len /* temp0=temp0+len */ + load_aes_keys key_data + str temp0,[context,IN_LENGTH_OFF] /* save in_length */ + /* Init Consts and IV */ + ldr qCtr,[context,CTR_OFF] + mov wtemp1,1 + eor vOne.16b,vOne.16b,vOne.16b + mov temp0,0x87 + eor vZero.16b,vZero.16b,vZero.16b + ins vOne.s[3],wtemp1 + dup vPoly.2d,temp0 + cbnz partial_block_length,23f /* if(partial_block_length!=0) not normal case*/ +21: /* start_of_mainloop */ + cbz len,24f + lsr blocks,len,4 + cmp blocks,HASHKEY_TOTAL_NUM - 1 + and len,len,0xf + /* loop aes gcm enc/dec loop */ + bls 2f /* skip loop */ +1: + sub blocks,blocks,HASHKEY_TOTAL_NUM + cmp blocks,HASHKEY_TOTAL_NUM - 1 + aes_gcm_n_round encrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Poly, \ + Ctr,EncCtr,One,out,Tmp0,Tmp1 + bhi 1b /* back to loop start */ +2: + cbz blocks,4f // left blocks == 0 + /* -(blocks - HASHKEY_TOTAL_NUM) */ + sub temp0,blocks,HASHKEY_TOTAL_NUM + neg temp0,temp0 + sub blocks,blocks,1 + add hashkey_addr,hashkey_base,temp0,lsl 5 + + aes_gcm_init encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbz blocks,3f /* 
origin_blocks == 1 */ + sub blocks,blocks,1 + + cbz blocks,2f /* origin_blocks == 2 */ +1: + sub blocks,blocks,1 + aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbnz blocks,1b +2: + aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* not load next hash */ +3: + poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly +4: + str qAadHash,[context] + str qCtr,[context,CTR_OFF] + cbnz len,24f +22: /* exit_with_popstack */ + pop_stack +20: /* exit_without_popstack */ + ret +23: /* partial_block_start */ + + generic_partial_block_start encrypt,len,in,out,context, \ + temp2,partial_block_length,temp0,temp1,hashkey_addr + cbnz partial_block_length,22b + ldr qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16] + ldr qHashKey0 ,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32] + ldr qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF] + ghash_block_reg AadHash,PartialBlock,HashKey0,HashKey0Ext, \ + High,Low,Middle,Zero,Poly,Tmp0 + str qAadHash,[context] + cbz len,4b + cmp len,15 + bhi 21b +24: /*partial_block_end */ + add vCtr.4s,vCtr.4s,vOne.4s + read_small_data_start PartialBlock,in,len,temp0,Tmp0 + rev32 vEncCtr.16b,vCtr.16b + str qCtr,[context,CTR_OFF] + aes_encrypt_block EncCtr + eor vPartialBlock.16b,vPartialBlock.16b,vEncCtr.16b + str qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF] + write_small_data_start PartialBlock,out,len,temp0,Tmp0 + str len,[context,PARTIAL_BLOCK_LENGTH_OFF] + pop_stack + ret + +END_FUNC(enc,KEY_LEN,_update_) +END_FUNC(enc,KEY_LEN,_update_nt_) + + +START_FUNC(dec,KEY_LEN,_update_) +START_FUNC(dec,KEY_LEN,_update_nt_) + ldr temp0,[context,IN_LENGTH_OFF] /*load in_length */ + ldr partial_block_length,[context,PARTIAL_BLOCK_LENGTH_OFF] + ldr qAadHash,[context] + cbz len,20f /** if(len==0)return; exit_without_popstack*/ + push_stack + add temp0,temp0,len /* temp0=temp0+len */ + load_aes_keys key_data + str 
temp0,[context,IN_LENGTH_OFF] /* save in_length */ + /* Init Consts and IV */ + ldr qCtr,[context,CTR_OFF] + mov wtemp1,1 + eor vOne.16b,vOne.16b,vOne.16b + mov temp0,0x87 + eor vZero.16b,vZero.16b,vZero.16b + ins vOne.s[3],wtemp1 + dup vPoly.2d,temp0 + cbnz partial_block_length,23f /* if(partial_block_length!=0) not normal case*/ +21: /* start_of_mainloop */ + cbz len,24f + lsr blocks,len,4 + cmp blocks,HASHKEY_TOTAL_NUM - 1 + and len,len,0xf + /** loop aes gcm enc/dec loop */ + bls 2f /* skip loop */ +1: + sub blocks,blocks,HASHKEY_TOTAL_NUM + cmp blocks,HASHKEY_TOTAL_NUM - 1 + aes_gcm_n_round decrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \ + HashKey0,HashKey0Ext,High,Low,Poly, \ + Ctr,EncCtr,One,out,Tmp0,Tmp1 + bhi 1b /* back to loop start */ +2: + cbz blocks,4f /* left blocks == 0 */ + /* -(blocks - HASHKEY_TOTAL_NUM) */ + sub temp0,blocks,HASHKEY_TOTAL_NUM + neg temp0,temp0 + sub blocks,blocks,1 + add hashkey_addr,hashkey_base,temp0,lsl 5 + + aes_gcm_init decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 // load next hash + cbz blocks,3f /* origin_blocks == 1 */ + sub blocks,blocks,1 + + cbz blocks,2f /* origin_blocks == 2 */ +1: + sub blocks,blocks,1 + aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */ + cbnz blocks,1b +2: + aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \ + High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* not load next hash */ +3: + poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly +4: + str qAadHash,[context] + str qCtr,[context,CTR_OFF] + cbnz len,24f +22: /* exit_with_popstack */ + pop_stack +20: /* exit_without_popstack */ + ret +23: /* partial_block_start */ + + generic_partial_block_start decrypt,len,in,out,context, \ + temp2,partial_block_length,temp0,temp1,hashkey_addr + cbnz partial_block_length,22b + ldr qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16] + ldr 
qHashKey0 ,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32] + ldr qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF] + ghash_block_reg AadHash,PartialBlock,HashKey0,HashKey0Ext, \ + High,Low,Middle,Zero,Poly,Tmp0 + str qAadHash,[context] + cbz len,4b + cmp len,15 + bhi 21b +24: /* partial_block_end */ + add vCtr.4s,vCtr.4s,vOne.4s + read_small_data_start PartialBlock,in,len,temp0,Tmp0 + rev32 vEncCtr.16b,vCtr.16b + str qCtr,[context,CTR_OFF] + aes_encrypt_block EncCtr + eor vEncCtr.16b,vPartialBlock.16b,vEncCtr.16b + tbx_small_data_start EncCtr,PartialBlock,len,temp0,Tmp0 + write_small_data_start EncCtr,out,len,temp0,Tmp0 + str qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF] + str len,[context,PARTIAL_BLOCK_LENGTH_OFF] + pop_stack + ret +END_FUNC(dec,KEY_LEN,_update_) +END_FUNC(dec,KEY_LEN,_update_nt_) diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S new file mode 100644 index 000000000..4a3e990c3 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S @@ -0,0 +1,134 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +.arch armv8-a+crypto + + .text +/* +Macros +*/ +#define NUM_ROUNDS(a) (7+(a)/32) +.macro declare_var_vector_reg name:req,reg:req + q\name .req q\reg + v\name .req v\reg + s\name .req s\reg +.endm +.macro round_128 off:req,rcon:req + .if \off == 0 + ldp w_tmp2,w_tmp3,[key,8] + ldp w_tmp0,w_tmp1,[key] + movi vzero.4s,0 + dup vsrc.4s,w_tmp3 + stp w_tmp2,w_tmp3,[exp_key_enc,8] + stp w_tmp0,w_tmp1,[exp_key_enc] + .endif + mov w0,\rcon + mov vdest.16b,vzero.16b + aese vdest.16b,vsrc.16b + mov w_tmp4,vdest.s[0] + eor w_tmp0,w_tmp0,w0 + eor w_tmp0,w_tmp0,w_tmp4,ror 8 + eor w_tmp1,w_tmp0,w_tmp1 + eor w_tmp2,w_tmp1,w_tmp2 + eor w_tmp3,w_tmp2,w_tmp3 + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*\off+KEY_LEN] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*\off+8+KEY_LEN] + .if \off != 10 + dup vsrc.4s,w_tmp3 + .endif +.endm +.macro export_dec_key rounds:req,enc_key:req,dec_key:req + ldr q0,[\enc_key] + ldr q1,[\enc_key,(\rounds-1)*16] + str q0,[\dec_key,(\rounds-1)*16] + str q1,[\dec_key] + ldp q0,q1,[\enc_key,1*16] + ldp q2,q3,[\enc_key,(1+2)*16] + ldp q4,q5,[\enc_key,(1+4)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ldp 
q6,q7,[\enc_key,(1+6)*16] + aesimc v2.16b,v2.16b + aesimc v3.16b,v3.16b + stp q1,q0,[\dec_key,(\rounds-1-2)*16] + aesimc v4.16b,v4.16b + aesimc v5.16b,v5.16b + stp q3,q2,[\dec_key,(\rounds-1-4)*16] + ldr q0,[\enc_key,(1+8)*16] + aesimc v6.16b,v6.16b + aesimc v7.16b,v7.16b + stp q5,q4,[\dec_key,(\rounds-1-6)*16] + aesimc v0.16b,v0.16b + stp q7,q6,[\dec_key,(\rounds-1-8)*16] + str q0,[\dec_key,(\rounds-1-9)*16] +.endm +/** + void aes_keyexp_128_aes(const uint8_t * key, + uint8_t * exp_key_enc, uint8_t * exp_key_dec) +*/ + key .req x0 + exp_key_enc .req x1 + exp_key_dec .req x2 + .equ KEY_LEN, (128/8) + w_tmp0 .req w3 + w_tmp1 .req w4 + w_tmp2 .req w5 + w_tmp3 .req w6 + w_tmp4 .req w7 + declare_var_vector_reg dest,0 + declare_var_vector_reg zero,1 + declare_var_vector_reg src, 2 + + + .global aes_keyexp_128_aes + .type aes_keyexp_128_aes, %function + +aes_keyexp_128_aes: + .set rcon,1 + .set off,0 + .rept 10 + round_128 off,rcon + .set off,off+1 + .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b) + .endr + + export_dec_key NUM_ROUNDS(128),exp_key_enc,exp_key_dec + ret + .size aes_keyexp_128_aes, .-aes_keyexp_128_aes + .global aes_keyexp_128_enc_aes + .type aes_keyexp_128_enc_aes, %function +aes_keyexp_128_enc_aes: + .set rcon,1 + .set off,0 + .rept 10 + round_128 off,rcon + .set off,off+1 + .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b) + .endr + ret + .size aes_keyexp_128_enc_aes, .-aes_keyexp_128_enc_aes
\ No newline at end of file diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S new file mode 100644 index 000000000..2ba46060c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S @@ -0,0 +1,136 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text +/* +Macros +*/ +#define NUM_ROUNDS(a) (7+(a)/32) +.macro declare_var_vector_reg name:req,reg:req + q\name .req q\reg + v\name .req v\reg + s\name .req s\reg +.endm +.macro round_192 off:req,rcon:req + .if \off == 0 + ldp w_tmp0,w_tmp1,[key] + ldp w_tmp2,w_tmp3,[key,8] + ldp w_tmp4,w_tmp5,[key,16] + movi vzero.4s,0 + dup vsrc.4s,w_tmp5 + stp w_tmp0,w_tmp1,[exp_key_enc] + stp w_tmp4,w_tmp5,[exp_key_enc,16] + stp w_tmp2,w_tmp3,[exp_key_enc,8] + .endif + mov w0,\rcon + mov vdest.16b,vzero.16b + aese vdest.16b,vsrc.16b + mov w_tmp,vdest.s[0] + eor w_tmp0,w_tmp0,w0 + eor w_tmp0,w_tmp0,w_tmp,ror 8 + eor w_tmp1,w_tmp0,w_tmp1 + eor w_tmp2,w_tmp1,w_tmp2 + eor w_tmp3,w_tmp2,w_tmp3 + .if \off < 7 + eor w_tmp4,w_tmp4,w_tmp3 + eor w_tmp5,w_tmp5,w_tmp4 + dup vsrc.4s,w_tmp5 + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8] + stp w_tmp4,w_tmp5,[exp_key_enc,KEY_LEN*(\off+1)+16] + .else + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8] + .endif +.endm + +.macro export_dec_key rounds:req,enc_key:req,dec_key:req + ldr q0,[\enc_key] + ldr q1,[\enc_key,(\rounds-1)*16] + str q0,[\dec_key,(\rounds-1)*16] + str q1,[\dec_key] + ldp q0,q1,[\enc_key,1*16] + ldp q2,q3,[\enc_key,(1+2)*16] + ldp q4,q5,[\enc_key,(1+4)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ldp q6,q7,[\enc_key,(1+6)*16] + aesimc v2.16b,v2.16b + aesimc v3.16b,v3.16b + stp q1,q0,[\dec_key,(\rounds-1-2)*16] + ldp q0,q1,[\enc_key,(1+8)*16] + aesimc v4.16b,v4.16b + aesimc v5.16b,v5.16b + stp q3,q2,[\dec_key,(\rounds-1-4)*16] + aesimc v6.16b,v6.16b + aesimc v7.16b,v7.16b + stp q5,q4,[\dec_key,(\rounds-1-6)*16] + ldr q2,[\enc_key,(1+10)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + stp q7,q6,[\dec_key,(\rounds-1-8)*16] + aesimc v2.16b,v2.16b + stp q1,q0,[\dec_key,(\rounds-1-10)*16] + str 
q2,[\dec_key,(\rounds-1-11)*16] +.endm +/** + void aes_keyexp_192_aes(const uint8_t * key, + uint8_t * exp_key_enc, uint8_t * exp_key_dec) +*/ + key .req x0 + exp_key_enc .req x1 + exp_key_dec .req x2 + .equ KEY_LEN, (192/8) + w_tmp0 .req w3 + w_tmp1 .req w4 + w_tmp2 .req w5 + w_tmp3 .req w6 + w_tmp .req w7 + w_tmp4 .req w9 + w_tmp5 .req w10 + declare_var_vector_reg dest,0 + declare_var_vector_reg zero,1 + declare_var_vector_reg src, 2 + + + .global aes_keyexp_192_aes + .type aes_keyexp_192_aes, %function + +aes_keyexp_192_aes: + .set rcon,1 + .set off,0 + .rept 8 + round_192 off,rcon + .set off,off+1 + .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b) + .endr + export_dec_key NUM_ROUNDS(192),exp_key_enc,exp_key_dec + ret + .size aes_keyexp_192_aes, .-aes_keyexp_192_aes + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S new file mode 100644 index 000000000..5433b2ff6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S @@ -0,0 +1,153 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + + .text +/* +Macros +*/ +#define NUM_ROUNDS(a) (7+(a)/32) +.macro declare_var_vector_reg name:req,reg:req + q\name .req q\reg + v\name .req v\reg + s\name .req s\reg +.endm +.macro round_256 off:req,rcon:req,export_dec_key + .if \off == 0 + ldp w_tmp6,w_tmp7,[key,24] + ldp w_tmp0,w_tmp1,[key] + ldp w_tmp2,w_tmp3,[key,8] + ldp w_tmp4,w_tmp5,[key,16] + movi vzero.4s,0 + dup vsrc.4s,w_tmp7 + stp w_tmp6,w_tmp7,[exp_key_enc,24] + stp w_tmp0,w_tmp1,[exp_key_enc] + stp w_tmp4,w_tmp5,[exp_key_enc,16] + stp w_tmp2,w_tmp3,[exp_key_enc,8] + .endif + mov w0,\rcon + mov vdest.16b,vzero.16b + aese vdest.16b,vsrc.16b + mov w_tmp,vdest.s[0] + eor w_tmp0,w_tmp0,w0 + eor w_tmp0,w_tmp0,w_tmp,ror 8 + eor w_tmp1,w_tmp0,w_tmp1 + eor w_tmp2,w_tmp1,w_tmp2 + eor w_tmp3,w_tmp2,w_tmp3 + .if \off < 6 + dup vsrc.4s,w_tmp3 + mov vdest.16b,vzero.16b + aese vdest.16b,vsrc.16b + mov w_tmp,vdest.s[0] + eor w_tmp4,w_tmp4,w_tmp + eor w_tmp5,w_tmp5,w_tmp4 + eor w_tmp6,w_tmp6,w_tmp5 + eor w_tmp7,w_tmp7,w_tmp6 + dup vsrc.4s,w_tmp7 + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8] + stp 
w_tmp4,w_tmp5,[exp_key_enc,KEY_LEN*(\off+1)+16] + stp w_tmp6,w_tmp7,[exp_key_enc,KEY_LEN*(\off+1)+24] + .else + stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)] + stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8] + .endif +.endm + +.macro export_dec_key rounds:req,enc_key:req,dec_key:req + ldr q0,[\enc_key] + ldr q1,[\enc_key,(\rounds-1)*16] + str q0,[\dec_key,(\rounds-1)*16] + str q1,[\dec_key] + ldp q0,q1,[\enc_key,1*16] + ldp q2,q3,[\enc_key,(1+2)*16] + ldp q4,q5,[\enc_key,(1+4)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + ldp q6,q7,[\enc_key,(1+6)*16] + aesimc v2.16b,v2.16b + aesimc v3.16b,v3.16b + stp q1,q0,[\dec_key,(\rounds-1-2)*16] + ldp q0,q1,[\enc_key,(1+8)*16] + aesimc v4.16b,v4.16b + aesimc v5.16b,v5.16b + stp q3,q2,[\dec_key,(\rounds-1-4)*16] + ldp q2,q3,[\enc_key,(1+10)*16] + + aesimc v6.16b,v6.16b + aesimc v7.16b,v7.16b + stp q5,q4,[\dec_key,(\rounds-1-6)*16] + ldr q4,[\enc_key,(1+12)*16] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + stp q7,q6,[\dec_key,(\rounds-1-8)*16] + aesimc v2.16b,v2.16b + aesimc v3.16b,v3.16b + stp q1,q0,[\dec_key,(\rounds-1-10)*16] + aesimc v4.16b,v4.16b + stp q3,q2,[\dec_key,(\rounds-1-12)*16] + str q4,[\dec_key,(\rounds-1-13)*16] +.endm +/** + void aes_keyexp_256_aes(const uint8_t * key, + uint8_t * exp_key_enc, uint8_t * exp_key_dec) +*/ + key .req x0 + exp_key_enc .req x1 + exp_key_dec .req x2 + .equ KEY_LEN, (256/8) + w_tmp0 .req w3 + w_tmp1 .req w4 + w_tmp2 .req w5 + w_tmp3 .req w6 + w_tmp .req w7 + w_tmp4 .req w9 + w_tmp5 .req w10 + w_tmp6 .req w11 + w_tmp7 .req w12 + declare_var_vector_reg dest,0 + declare_var_vector_reg zero,1 + declare_var_vector_reg src, 2 + + + .global aes_keyexp_256_aes + .type aes_keyexp_256_aes, %function + +aes_keyexp_256_aes: + .set rcon,1 + .set off,0 + .rept 7 + round_256 off,rcon,1 + .set off,off+1 + .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b) + .endr + export_dec_key NUM_ROUNDS(256),exp_key_enc,exp_key_dec + ret + .size aes_keyexp_256_aes, .-aes_keyexp_256_aes + diff --git 
a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c new file mode 100644 index 000000000..14c9889ac --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c @@ -0,0 +1,72 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <aarch64_multibinary.h> +/* PROVIDER_BASIC is overridden to expand to a null pointer, so a dispatcher below yields NULL whenever it does not select the AES-instruction provider. */ +#undef PROVIDER_BASIC +#define PROVIDER_BASIC(a) (void*)0 +/* Each dispatcher reads AT_HWCAP through getauxval() and returns the _aes provider only when both HWCAP_ASIMD and HWCAP_AES are set. */ +DEFINE_INTERFACE_DISPATCHER(aes_keyexp_128) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES)) + return PROVIDER_INFO(aes_keyexp_128_aes); + + return PROVIDER_BASIC(aes_keyexp_128); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_keyexp_128_enc) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES)) + return PROVIDER_INFO(aes_keyexp_128_enc_aes); + + return PROVIDER_BASIC(aes_keyexp_128_enc); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_keyexp_192) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES)) + return PROVIDER_INFO(aes_keyexp_192_aes); + + return PROVIDER_BASIC(aes_keyexp_192); + +} + +DEFINE_INTERFACE_DISPATCHER(aes_keyexp_256) +{ + unsigned long auxval = getauxval(AT_HWCAP); + if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES)) + return PROVIDER_INFO(aes_keyexp_256_aes); + + return PROVIDER_BASIC(aes_keyexp_256); + +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S new file mode 100644 index 000000000..aa7c32576 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S @@ -0,0 +1,35 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "aarch64_multibinary.h" + +mbin_interface aes_keyexp_128 +mbin_interface aes_keyexp_128_enc +mbin_interface aes_keyexp_192 +mbin_interface aes_keyexp_256 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c new file mode 100644 index 000000000..6c918858e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c @@ -0,0 +1,102 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <aarch64_multibinary.h> +/* PROVIDER_BASIC is overridden to expand to a null pointer, so a dispatcher below yields NULL whenever is_crypto_available() is false. */ +#undef PROVIDER_BASIC +#define PROVIDER_BASIC(a) (void*)0 +/* Nonzero when AT_HWCAP reports both HWCAP_ASIMD and HWCAP_AES, zero otherwise. */ +static unsigned long is_crypto_available(void) +{ + unsigned long auxval = getauxval(AT_HWCAP); + return (auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES); +} +/* XTS dispatchers: each returns its _ce provider when is_crypto_available() is true and PROVIDER_BASIC(...) otherwise. */ +DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_enc) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_128_enc_ce); + } + return PROVIDER_BASIC(XTS_AES_128_enc); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_dec) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_128_dec_ce); + } + return PROVIDER_BASIC(XTS_AES_128_dec); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_enc_expanded_key) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_128_enc_expanded_key_ce); + } + return PROVIDER_BASIC(XTS_AES_128_enc_expanded_key); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_dec_expanded_key) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_128_dec_expanded_key_ce); + } + return PROVIDER_BASIC(XTS_AES_128_dec_expanded_key); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_enc) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_256_enc_ce); + } + return PROVIDER_BASIC(XTS_AES_256_enc); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_dec) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_256_dec_ce); + } + return PROVIDER_BASIC(XTS_AES_256_dec); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_enc_expanded_key) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_256_enc_expanded_key_ce); + } + return PROVIDER_BASIC(XTS_AES_256_enc_expanded_key); +} + +DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_dec_expanded_key) +{ + if (is_crypto_available()) { + return PROVIDER_INFO(XTS_AES_256_dec_expanded_key_ce); + } + return PROVIDER_BASIC(XTS_AES_256_dec_expanded_key); +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S
b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S new file mode 100644 index 000000000..318c1e8a4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S @@ -0,0 +1,214 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +/* aes_key_expand_next: derive round key vKey<next> from vKey<prev>. ctx names the register holding the round constant; it is reloaded with 0x1b when next == 9 and shifted left by one after it has been mixed in. Uses vdest, vtmp and tmpw as scratch and vzero as an all-zero operand. The .altmacro mode enabled below provides the %prev expression arguments used by aes_key_expand. */ +.altmacro +.macro aes_key_expand_next next:req,prev:req,ctx:req + .if \next == 9 + mov \ctx, 0x1b + .endif + dup vdest.4s,vKey\prev\().s[3] + ext vtmp.16b,vzero.16b,vKey\prev\().16b,#12 + aese vdest.16b,vzero.16b + eor vKey\next\().16b,vKey\prev\().16b,vtmp.16b + ext vtmp.16b,vzero.16b,vtmp.16b,#12 + eor vKey\next\().16b,vKey\next\().16b,vtmp.16b + ext vtmp.16b,vzero.16b,vtmp.16b,#12 + mov tmpw,vdest.s[0] + eor tmpw,\ctx,tmpw,ror 8 + dup vdest.4s,tmpw + eor vKey\next\().16b,vKey\next\().16b,vtmp.16b + mov \ctx,\ctx,lsl 1 + eor vKey\next\().16b,vKey\next\().16b,vdest.16b +.endm + +/* when loading key = 0 + * arg1 = register holding the input key address + * arg2 = rcon ctx register (optional, defaults to rcon) + * when loading key > 0 + * arg1 = rcon ctx register (optional, defaults to rcon) + */ +.macro aes_key_expand key:req,arg1,arg2 + .if \key == 0 + ld1 {vKey\key\().4s},[\arg1] + movi vzero.4s, 0 + .ifb \arg2 + mov rcon,#0x01 + .endif + + .ifnb \arg2 + mov \arg2,#0x01 + .endif + .endif + + .if \key > 0 + prev=\key-1 + .ifb \arg1 + aes_key_expand_next \key,%prev,rcon + .endif + + .ifnb \arg1 + aes_key_expand_next \key,%prev,\arg1 + .endif + .endif +.endm +/* aes_round: apply AES round <key> (0..10) to block. mode 0 selects aese/aesmc (encrypt), any other value selects aesd/aesimc (decrypt). Key 9 omits the mix-columns instruction and key 10 is a plain eor with vKey10. */ +.macro aes_round block:req,key:req,mode:req + .if \key < 9 + .if \mode == 0 + aese \block\().16b,vKey\key\().16b + aesmc \block\().16b,\block\().16b + .else + aesd \block\().16b,vKey\key\().16b + aesimc \block\().16b,\block\().16b + .endif + .endif + .if \key == 9 + .if \mode == 0 + aese \block\().16b,vKey\key\().16b + .else + aesd \block\().16b,vKey\key\().16b + .endif + .endif + .if \key == 10 + eor \block\().16b,\block\().16b,vKey\key\().16b + .endif +.endm +/* aes_round_interleave: the same round applied to four blocks b0..b3 back to back. For key 9 each block is additionally eor-ed with vKey<last_key>. */ +.macro aes_round_interleave b0:req,b1:req,b2:req,b3:req,key:req,mode:req,last_key + .if \key < 9 + .if \mode == 0 + aese \b0\().16b,vKey\key\().16b + aesmc \b0\().16b,\b0\().16b + aese \b1\().16b,vKey\key\().16b + aesmc \b1\().16b,\b1\().16b + aese \b2\().16b,vKey\key\().16b + aesmc \b2\().16b,\b2\().16b + aese \b3\().16b,vKey\key\().16b +
aesmc \b3\().16b,\b3\().16b + .else + aesd \b0\().16b,vKey\key\().16b + aesimc \b0\().16b,\b0\().16b + aesd \b1\().16b,vKey\key\().16b + aesimc \b1\().16b,\b1\().16b + aesd \b2\().16b,vKey\key\().16b + aesimc \b2\().16b,\b2\().16b + aesd \b3\().16b,vKey\key\().16b + aesimc \b3\().16b,\b3\().16b + .endif + .endif + /* key 9: no mix-columns step; each block is eor-ed with vKey<last_key> right after aese/aesd */ + .if \key == 9 + .if \mode == 0 + aese \b0\().16b,vKey\key\().16b + eor \b0\().16b,\b0\().16b,vKey\last_key\().16b + aese \b1\().16b,vKey\key\().16b + eor \b1\().16b,\b1\().16b,vKey\last_key\().16b + aese \b2\().16b,vKey\key\().16b + eor \b2\().16b,\b2\().16b,vKey\last_key\().16b + aese \b3\().16b,vKey\key\().16b + eor \b3\().16b,\b3\().16b,vKey\last_key\().16b + .else + aesd \b0\().16b,vKey\key\().16b + eor \b0\().16b,\b0\().16b,vKey\last_key\().16b + aesd \b1\().16b,vKey\key\().16b + eor \b1\().16b,\b1\().16b,vKey\last_key\().16b + aesd \b2\().16b,vKey\key\().16b + eor \b2\().16b,\b2\().16b,vKey\last_key\().16b + aesd \b3\().16b,vKey\key\().16b + eor \b3\().16b,\b3\().16b,vKey\last_key\().16b + .endif + .endif +.endm +/* aes_rounds_interleave: run rounds 0..9 on four blocks via aes_round_interleave; round 9 is given last_key 10 so the eor with vKey10 is included. */ +.macro aes_rounds_interleave b0:req,b1:req,b2:req,b3:req,mode + aes_round_interleave \b0,\b1,\b2,\b3,0,\mode + aes_round_interleave \b0,\b1,\b2,\b3,1,\mode + aes_round_interleave \b0,\b1,\b2,\b3,2,\mode + aes_round_interleave \b0,\b1,\b2,\b3,3,\mode + aes_round_interleave \b0,\b1,\b2,\b3,4,\mode + aes_round_interleave \b0,\b1,\b2,\b3,5,\mode + aes_round_interleave \b0,\b1,\b2,\b3,6,\mode + aes_round_interleave \b0,\b1,\b2,\b3,7,\mode + aes_round_interleave \b0,\b1,\b2,\b3,8,\mode + aes_round_interleave \b0,\b1,\b2,\b3,9,\mode,10 +.endm +/* aes_rounds: run aes_round for keys 0..10 on a single block. */ +.macro aes_rounds blk:req,mode:req + aes_round \blk,0,\mode + aes_round \blk,1,\mode + aes_round \blk,2,\mode + aes_round \blk,3,\mode + aes_round \blk,4,\mode + aes_round \blk,5,\mode + aes_round \blk,6,\mode + aes_round \blk,7,\mode + aes_round \blk,8,\mode + aes_round \blk,9,\mode + aes_round \blk,10,\mode +.endm + +/* load k1/k2 from memory and encrypt the tweak by k2 + * both keys will share the
same set of registers
+ * but will never overlap (k2 is used only once and discarded)
+ *
+ *   iv: vector register holding the tweak; it is passed through
+ *       aes_enc_round for round key indices 0..10 (the k2 schedule)
+ *   k2: pointer register to the expanded tweak key, 11 round keys
+ *   k1: pointer register to the expanded data key, 11 round keys
+ *
+ * Both pointers are post-incremented and end up 176 bytes past their
+ * initial value. Each k1 load only overwrites vKey registers whose k2
+ * contents were already consumed by an earlier aes_enc_round, so on
+ * exit vKey0..vKey10 hold the k1 schedule.
+ * Every pointer operand uses the explicit backslash form so the macro
+ * does not depend on bare-name argument substitution.
+ */
+.macro keyload_and_encrypt_tweak iv:req,k2:req,k1:req
+    ldp     qKey0,qKey1,[\k2],#32
+    aes_enc_round \iv,0
+    ldp     qKey2,qKey3,[\k2],#32
+    aes_enc_round \iv,1
+    ldp     qKey0,qKey1,[\k1],#32
+    aes_enc_round \iv,2
+    ldp     qKey4,qKey5,[\k2],#32
+    aes_enc_round \iv,3
+    ldp     qKey2,qKey3,[\k1],#32
+    aes_enc_round \iv,4
+    ldp     qKey6,qKey7,[\k2],#32
+    aes_enc_round \iv,5
+    ldp     qKey4,qKey5,[\k1],#32
+    aes_enc_round \iv,6
+    ldp     qKey8,qKey9,[\k2],#32
+    aes_enc_round \iv,7
+    ldp     qKey6,qKey7,[\k1],#32
+    aes_enc_round \iv,8
+    ld1     {vKey10.16b},[\k2],#16
+    aes_enc_round \iv,9
+    ldp     qKey8,qKey9,[\k1],#32
+    aes_enc_round \iv,10
+    ld1     {vKey10.16b},[\k1],#16
+.endm
+
+/* Open a 32-byte stack frame: d8/d9 are stored in the low 16 bytes and
+ * tmpbuf is pointed at the upper 16 bytes, which serve as scratch.
+ */
+.macro save_stack
+    stp     d8,d9,[sp, -32]!
+    add     tmpbuf,sp,16
+.endm
+
+/* Reload d8/d9 and release the 32-byte frame opened by save_stack. */
+.macro restore_stack
+    ldp     d8,d9,[sp],32
+.endm
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S
new file mode 100644
index 000000000..ceae2d3c0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S
@@ -0,0 +1,116 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+    .arch armv8-a+crypto
+    .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+/* Exchange the contents of two vector registers, using vtmp as scratch. */
+.macro vswap vec1:req,vec2:req
+    mov     vtmp.16b,\vec1\().16b
+    mov     \vec1\().16b,\vec2\().16b
+    mov     \vec2\().16b,vtmp.16b
+.endm
+
+/* encrypt the tweak by tweak key (k2), and at the same time
+ * to expand encryption key (k1)
+ * even though two sets of keys share the same set of registers
+ * they never overlap at any given time (k2 is used only once and discarded)
+ *
+ *   iv: vector register holding the tweak, run through aes_enc_round
+ *       for round key indices 0..10
+ *   k2: pointer register to the raw tweak key
+ *   k1: pointer register to the raw data key
+ *
+ * Expansion steps without a register argument belong to the k2
+ * schedule; steps that name rcon2 belong to the k1 schedule. After the
+ * last step the k1 schedule is converted for decryption: aesimc is
+ * applied to round keys 1..9 and the register order is reversed by
+ * swapping 0/10, 1/9, 2/8, 3/7 and 4/6.
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+    aes_key_expand 0,\k2
+    aes_enc_round \iv,0
+    aes_key_expand 1
+    aes_enc_round \iv,1
+    aes_key_expand 0,\k1,rcon2
+    aes_key_expand 2
+    aes_enc_round \iv,2
+    aes_key_expand 1,rcon2
+    aes_key_expand 3
+    aes_enc_round \iv,3
+    aes_key_expand 2,rcon2
+    aes_key_expand 4
+    aes_enc_round \iv,4
+    aes_key_expand 3,rcon2
+    aes_key_expand 5
+    aes_enc_round \iv,5
+    aes_key_expand 4,rcon2
+    aes_key_expand 6
+    aes_enc_round \iv,6
+    aes_key_expand 5,rcon2
+    aes_key_expand 7
+    aes_enc_round \iv,7
+    aes_key_expand 6,rcon2
+    aes_key_expand 8
+    aes_enc_round \iv,8
+    aes_key_expand 7,rcon2
+    aes_key_expand 9
+    aes_enc_round \iv,9
+    aes_key_expand 8,rcon2
+    aes_key_expand 10
+    aes_enc_round \iv,10
+    aes_key_expand 9,rcon2
+    aes_key_expand 10,rcon2
+
+    // transform encryption key into decryption key
+    aesimc  vKey1.16b,vKey1.16b
+    vswap vKey0,vKey10
+    aesimc  vKey9.16b,vKey9.16b
+
+    aesimc  vKey2.16b,vKey2.16b
+    aesimc  vKey8.16b,vKey8.16b
+    vswap vKey1,vKey9
+
+    aesimc  vKey3.16b,vKey3.16b
+    aesimc  vKey7.16b,vKey7.16b
+    vswap vKey2,vKey8
+
+    aesimc  vKey4.16b,vKey4.16b
+    aesimc  vKey6.16b,vKey6.16b
+    vswap vKey3,vKey7
+
+    aesimc  vKey5.16b,vKey5.16b      // middle key keeps its position
+    vswap vKey4,vKey6
+.endm
+
+/*
+ * void XTS_AES_128_dec_ce(
+ *      uint8_t *k2,            //!< key used for tweaking, 16 bytes
+ *      uint8_t *k1,            //!< key used for decryption of tweaked ciphertext, 16 bytes
+ *      uint8_t *TW_initial,    //!< initial tweak value, 16 bytes
+ *      uint64_t N,             //!< sector size, in bytes
+ *      const uint8_t *ct,      //!< ciphertext sector input data
+ *      uint8_t *pt             //!< plaintext sector output data
+ *      );
+*/
+    .global XTS_AES_128_dec_ce
+    .type XTS_AES_128_dec_ce, %function
+XTS_AES_128_dec_ce:
+    // mode 1 selects decryption; the expander macro name is followed by its arguments
+    xts_aes_crypt 1,keyexp_and_encrypt_tweak vIV0,key2,key1
+    .size XTS_AES_128_dec_ce, .-XTS_AES_128_dec_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S
new file mode 100644
index 000000000..23ed14a38
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S
@@ -0,0 +1,91 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+    .arch armv8-a+crypto
+    .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+/* encrypt the tweak by tweak key (k2), and at the same time
+ * to expand encryption key (k1)
+ * even though two sets of keys share the same set of registers
+ * they never overlap at any given time (k2 is used once and discarded)
+ *
+ *   iv: vector register holding the tweak, run through aes_enc_round
+ *       for round key indices 0..10
+ *   k2: pointer register to the raw tweak key
+ *   k1: pointer register to the raw data key
+ *
+ * Expansion steps without a register argument belong to the k2
+ * schedule; steps that name rcon2 belong to the k1 schedule.
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+    aes_key_expand 0,\k2
+    aes_enc_round \iv,0
+    aes_key_expand 1
+    aes_enc_round \iv,1
+    aes_key_expand 0,\k1,rcon2
+    aes_key_expand 2
+    aes_enc_round \iv,2
+    aes_key_expand 1,rcon2
+    aes_key_expand 3
+    aes_enc_round \iv,3
+    aes_key_expand 2,rcon2
+    aes_key_expand 4
+    aes_enc_round \iv,4
+    aes_key_expand 3,rcon2
+    aes_key_expand 5
+    aes_enc_round \iv,5
+    aes_key_expand 4,rcon2
+    aes_key_expand 6
+    aes_enc_round \iv,6
+    aes_key_expand 5,rcon2
+    aes_key_expand 7
+    aes_enc_round \iv,7
+    aes_key_expand 6,rcon2
+    aes_key_expand 8
+    aes_enc_round \iv,8
+    aes_key_expand 7,rcon2
+    aes_key_expand 9
+    aes_enc_round \iv,9
+    aes_key_expand 8,rcon2
+    aes_key_expand 10
+    aes_enc_round \iv,10
+    aes_key_expand 9,rcon2
+    aes_key_expand 10,rcon2
+.endm
+
+
+/*
+ * void XTS_AES_128_enc_ce(
+ *      uint8_t *k2,            //!< key used for tweaking, 16 bytes
+ *      uint8_t *k1,            //!< key used for encryption of tweaked plaintext, 16 bytes
+ *      uint8_t *TW_initial,    //!< initial tweak value, 16 bytes
+ *      uint64_t N,             //!< sector size, in bytes
+ *      const uint8_t *pt,      //!< cleartext sector input data
+ *      uint8_t *ct             //!< ciphertext sector output data
+ *      );
+ */
+    .global XTS_AES_128_enc_ce
+    .type XTS_AES_128_enc_ce, %function
+XTS_AES_128_enc_ce:
+    // mode 0 selects encryption; the expander macro name is followed by its arguments
+    xts_aes_crypt 0,keyexp_and_encrypt_tweak vIV0,key2,key1
+    .size XTS_AES_128_enc_ce, .-XTS_AES_128_enc_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S
new file mode 100644
index 000000000..e6535dba3
--- /dev/null
+++
b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S @@ -0,0 +1,247 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+.altmacro
+/* Derive the next round keys of the AES-256 key schedule.
+ *   out0,out1: indices of the vKey registers to produce
+ *   in0,in1:   indices of the two preceding round keys
+ *   ctx:       32-bit register holding the current round constant;
+ *              it is shifted left by one for the next call
+ * vKey<out0> is built from vKey<in0> and the substituted, rotated last
+ * word of vKey<in1> XORed with ctx. vKey<out1> is built from vKey<in1>
+ * and the substituted (not rotated, no round constant) last word of
+ * vKey<out0>; it is skipped when out1 is 14 or more.
+ * Clobbers vdest, vtmp and tmpw; expects vzero to be all zeroes.
+ */
+.macro aes_key_expand_next out0:req,out1:req,in0:req,in1:req,ctx:req
+    dup     vdest.4s,vKey\in1\().s[3]
+    ext     vtmp.16b,vzero.16b,vKey\in0\().16b,#12      // in0 moved up by one word
+    aese    vdest.16b,vzero.16b                         // zero key: byte substitution only
+    eor     vKey\out0\().16b,vKey\in0\().16b,vtmp.16b
+    ext     vtmp.16b,vzero.16b,vtmp.16b,#12
+    eor     vKey\out0\().16b,vKey\out0\().16b,vtmp.16b
+    ext     vtmp.16b,vzero.16b,vtmp.16b,#12
+    mov     tmpw,vdest.s[0]
+    eor     tmpw,\ctx,tmpw,ror 8                        // rotate the word, add round constant
+    dup     vdest.4s,tmpw
+    eor     vKey\out0\().16b,vKey\out0\().16b,vtmp.16b
+    mov     \ctx,\ctx,lsl 1                             // next round constant
+    eor     vKey\out0\().16b,vKey\out0\().16b,vdest.16b
+
+    .if \out1 < 14
+        dup     vdest.4s, vKey\out0\().s[3]
+        ext     vtmp.16b, vzero.16b,vKey\in1\().16b,#12
+        aese    vdest.16b,vzero.16b
+        eor     vKey\out1\().16b,vKey\in1\().16b,vtmp.16b
+        ext     vtmp.16b,vzero.16b,vtmp.16b,#12
+        eor     vKey\out1\().16b,vKey\out1\().16b,vtmp.16b
+        ext     vtmp.16b,vzero.16b,vtmp.16b,#12
+        eor     vKey\out1\().16b,vKey\out1\().16b,vtmp.16b
+        eor     vKey\out1\().16b,vKey\out1\().16b,vdest.16b
+    .endif
+.endm
+
+/* One step of the AES-256 key expansion.
+ * when loading key = 0
+ *     arg1 = input key (pointer register; 32 bytes are read into
+ *            vKey0 and vKey1, and vzero is cleared)
+ *     arg2 = rcon ctx register (optional, rcon is used when omitted);
+ *            it is initialised to 0x01
+ * when loading key > 0
+ *     arg1 = rcon ctx register (optional, rcon is used when omitted)
+ *     vKey<key> and vKey<key+1> are derived from vKey<key-2> and
+ *     vKey<key-1> through aes_key_expand_next
+ * The index arithmetic relies on the percent-expression feature of
+ * .altmacro.
+ */
+.macro aes_key_expand key:req,arg1,arg2
+    .if \key == 0
+        ld1     {vKey0.4s,vKey1.4s},[\arg1]
+        movi    vzero.4s, 0
+        .ifb \arg2
+            mov     rcon,#0x01
+        .endif
+
+        .ifnb \arg2
+            mov     \arg2,#0x01
+        .endif
+    .endif
+
+    .if \key > 0
+        in0=\key-2
+        in1=\key-1
+        out0=\key
+        out1=\key+1
+        .ifb \arg1
+            aes_key_expand_next %out0,%out1,%in0,%in1,rcon
+        .endif
+
+        .ifnb \arg1
+            aes_key_expand_next %out0,%out1,%in0,%in1,\arg1
+        .endif
+    .endif
+.endm
+
+/* Apply one AES-256 round step to a single block, in place.
+ *   block: vector register name
+ *   key:   round key index; below 13 is a full round, 13 omits the
+ *          column mixing, 14 only XORs in the last round key
+ *   mode:  0 = encrypt (aese/aesmc), otherwise decrypt (aesd/aesimc)
+ */
+.macro aes_round block:req,key:req,mode:req
+    .if \key < 13
+        .if \mode == 0
+            aese    \block\().16b,vKey\key\().16b
+            aesmc   \block\().16b,\block\().16b
+        .else
+            aesd    \block\().16b,vKey\key\().16b
+            aesimc  \block\().16b,\block\().16b
+        .endif
+    .endif
+    .if \key == 13
+        .if \mode == 0
+            aese    \block\().16b,vKey\key\().16b
+        .else
+            aesd    \block\().16b,vKey\key\().16b
+        .endif
+    .endif
+    .if \key == 14
+        eor     \block\().16b,\block\().16b,vKey\key\().16b
+    .endif
+.endm
+
+/* Apply one AES-256 round step to four blocks, in place.
+ *   b0..b3:   vector register names of the blocks
+ *   key:      round key index; below 13 is a full round, 13 is the
+ *             final round and also XORs in vKey<last_key>
+ *   mode:     0 = encrypt, otherwise decrypt
+ *   last_key: index of the last round key, only used when key is 13
+ */
+.macro aes_round_interleave b0:req,b1:req,b2:req,b3:req,key:req,mode:req,last_key
+    .if \key < 13
+        .if \mode == 0
+            aese    \b0\().16b,vKey\key\().16b
+            aesmc   \b0\().16b,\b0\().16b
+            aese    \b1\().16b,vKey\key\().16b
+            aesmc   \b1\().16b,\b1\().16b
+            aese    \b2\().16b,vKey\key\().16b
+            aesmc   \b2\().16b,\b2\().16b
+            aese    \b3\().16b,vKey\key\().16b
+            aesmc   \b3\().16b,\b3\().16b
+        .else
+            aesd    \b0\().16b,vKey\key\().16b
+            aesimc  \b0\().16b,\b0\().16b
+            aesd    \b1\().16b,vKey\key\().16b
+            aesimc  \b1\().16b,\b1\().16b
+            aesd    \b2\().16b,vKey\key\().16b
+            aesimc  \b2\().16b,\b2\().16b
+            aesd    \b3\().16b,vKey\key\().16b
+            aesimc  \b3\().16b,\b3\().16b
+        .endif
+    .endif
+
+    .if \key == 13
+        .if \mode == 0
+            aese    \b0\().16b,vKey\key\().16b
+            eor     \b0\().16b,\b0\().16b,vKey\last_key\().16b
+            aese    \b1\().16b,vKey\key\().16b
+            eor     \b1\().16b,\b1\().16b,vKey\last_key\().16b
+            aese    \b2\().16b,vKey\key\().16b
+            eor     \b2\().16b,\b2\().16b,vKey\last_key\().16b
+            aese    \b3\().16b,vKey\key\().16b
+            eor     \b3\().16b,\b3\().16b,vKey\last_key\().16b
+        .else
+            aesd    \b0\().16b,vKey\key\().16b
+            eor     \b0\().16b,\b0\().16b,vKey\last_key\().16b
+            aesd    \b1\().16b,vKey\key\().16b
+            eor     \b1\().16b,\b1\().16b,vKey\last_key\().16b
+            aesd    \b2\().16b,vKey\key\().16b
+            eor     \b2\().16b,\b2\().16b,vKey\last_key\().16b
+            aesd    \b3\().16b,vKey\key\().16b
+            eor     \b3\().16b,\b3\().16b,vKey\last_key\().16b
+        .endif
+    .endif
+.endm
+
+
+
+/* Run the complete AES-256 round sequence on four blocks at once:
+ * round key indices 0..12 followed by index 13 with last_key = 14.
+ */
+.macro aes_rounds_interleave b0:req,b1:req,b2:req,b3:req,mode
+    aes_round_interleave \b0,\b1,\b2,\b3,0,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,1,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,2,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,3,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,4,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,5,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,6,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,7,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,8,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,9,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,10,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,11,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,12,\mode
+    aes_round_interleave \b0,\b1,\b2,\b3,13,\mode,14
+.endm
+
+
+/* Run the complete AES-256 round sequence on a single block by
+ * invoking aes_round for round key indices 0 through 14.
+ */
+.macro aes_rounds blk:req,mode:req
+    aes_round \blk,0,\mode
+    aes_round \blk,1,\mode
+    aes_round \blk,2,\mode
+    aes_round \blk,3,\mode
+    aes_round \blk,4,\mode
+    aes_round \blk,5,\mode
+    aes_round \blk,6,\mode
+    aes_round \blk,7,\mode
+    aes_round \blk,8,\mode
+    aes_round \blk,9,\mode
+    aes_round \blk,10,\mode
+    aes_round \blk,11,\mode
+    aes_round \blk,12,\mode
+    aes_round \blk,13,\mode
+    aes_round \blk,14,\mode
+.endm
+
+/* load k1/k2 from memory and encrypt the tweak by k2
+ * both keys will share the same set of registers
+ * but will never overlap (k2 is used only once and discarded)
+ *
+ *   iv: vector register holding the tweak, encrypted in place
+ *   k2: pointer register to the expanded tweak key, 15 round keys
+ *   k1: pointer register to the expanded data key, 15 round keys
+ *
+ * Both pointers are post-incremented and end up 240 bytes past their
+ * initial value. Each k1 load only overwrites vKey registers whose k2
+ * contents were already consumed by an earlier aes_enc_round, so on
+ * exit vKey0..vKey14 hold the k1 schedule.
+ */
+.macro keyload_and_encrypt_tweak iv:req,k2:req,k1:req
+    ldp     qKey0,qKey1,[\k2],#32
+    aes_enc_round \iv,0
+    ldp     qKey2,qKey3,[\k2],#32
+    aes_enc_round \iv,1
+    ldp     qKey0,qKey1,[\k1],#32
+    aes_enc_round \iv,2
+    ldp     qKey4,qKey5,[\k2],#32
+    aes_enc_round \iv,3
+    ldp     qKey2,qKey3,[\k1],#32
+    aes_enc_round \iv,4
+    ldp     qKey6,qKey7,[\k2],#32
+    aes_enc_round \iv,5
+    ldp     qKey4,qKey5,[\k1],#32
+    aes_enc_round \iv,6
+    ldp     qKey8,qKey9,[\k2],#32
+    aes_enc_round \iv,7
+    ldp     qKey6,qKey7,[\k1],#32
+    aes_enc_round \iv,8
+    ldp     qKey10,qKey11,[\k2],#32
+    aes_enc_round \iv,9
+    ldp     qKey8,qKey9,[\k1],#32
+    aes_enc_round \iv,10
+    ldp     qKey12,qKey13,[\k2],#32
+    aes_enc_round \iv,11
+    ldp     qKey10,qKey11,[\k1],#32
+    aes_enc_round \iv,12
+    ld1     {vKey14.16b},[\k2],#16
+    aes_enc_round \iv,13
+    ldp     qKey12,qKey13,[\k1],#32
+    aes_enc_round \iv,14
+    ld1     {vKey14.16b},[\k1],#16
+.endm
+
+/* Open a 48-byte stack frame: d8..d11 are stored in the low 32 bytes
+ * and tmpbuf is pointed at the upper 16 bytes, which serve as scratch.
+ */
+.macro save_stack
+    stp     d8,d9,[sp, -48]!
+    stp     d10,d11,[sp, 16]
+    add     tmpbuf,sp,32
+.endm
+
+/* Reload d8..d11 and release the 48-byte frame opened by save_stack. */
+.macro restore_stack
+    ldp     d10,d11,[sp, 16]
+    ldp     d8,d9,[sp], 48
+.endm
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S
new file mode 100644
index 000000000..aa46ded08
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S
@@ -0,0 +1,116 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+    .arch armv8-a+crypto
+    .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+/* Exchange the contents of two vector registers, using vtmp as scratch. */
+.macro vswap vec1:req,vec2:req
+    mov     vtmp.16b,\vec1\().16b
+    mov     \vec1\().16b,\vec2\().16b
+    mov     \vec2\().16b,vtmp.16b
+.endm
+
+/* encrypt the tweak by tweak key (k2), and at the same time
+ * to expand encryption key (k1)
+ * even though two sets of keys share the same set of registers
+ * they never overlap at any given time (k2 is used only once and discarded)
+ *
+ *   iv: vector register holding the tweak, run through aes_enc_round
+ *       for round key indices 0..14
+ *   k2: pointer register to the raw tweak key
+ *   k1: pointer register to the raw data key
+ *
+ * Each expansion step produces two round keys. Steps without a
+ * register argument belong to the k2 schedule; steps that name rcon2
+ * belong to the k1 schedule. After the last step the k1 schedule is
+ * converted for decryption: aesimc is applied to round keys 1..13 and
+ * the register order is reversed by swapping 0/14, 1/13, ... 6/8.
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+    aes_key_expand 0,\k2
+    aes_enc_round \iv,0
+    aes_enc_round \iv,1
+    aes_key_expand 2
+    aes_key_expand 0,\k1,rcon2
+    aes_enc_round \iv,2
+    aes_enc_round \iv,3
+    aes_key_expand 4
+    aes_key_expand 2,rcon2
+    aes_enc_round \iv,4
+    aes_enc_round \iv,5
+    aes_key_expand 6
+    aes_key_expand 4,rcon2
+    aes_enc_round \iv,6
+    aes_enc_round \iv,7
+    aes_key_expand 8
+    aes_key_expand 6,rcon2
+    aes_enc_round \iv,8
+    aes_enc_round \iv,9
+    aes_key_expand 10
+    aes_key_expand 8,rcon2
+    aes_enc_round \iv,10
+    aes_enc_round \iv,11
+    aes_key_expand 12
+    aes_key_expand 10,rcon2
+    aes_enc_round \iv,12
+    aes_enc_round \iv,13
+    aes_key_expand 14
+    aes_key_expand 12,rcon2
+    aes_enc_round \iv,14
+    aes_key_expand 14,rcon2
+
+    // transform encryption key into decryption key
+    aesimc  vKey1.16b,vKey1.16b
+    vswap vKey0,vKey14
+    aesimc  vKey13.16b,vKey13.16b
+    aesimc  vKey2.16b,vKey2.16b
+    vswap vKey1,vKey13
+    aesimc  vKey12.16b,vKey12.16b
+    aesimc  vKey3.16b,vKey3.16b
+    vswap vKey2,vKey12
+    aesimc  vKey11.16b,vKey11.16b
+    aesimc  vKey4.16b,vKey4.16b
+    vswap vKey3,vKey11
+    aesimc  vKey10.16b,vKey10.16b
+    aesimc  vKey5.16b,vKey5.16b
+    vswap vKey4,vKey10
+    aesimc  vKey9.16b,vKey9.16b
+    aesimc  vKey6.16b,vKey6.16b
+    vswap vKey5,vKey9
+    aesimc  vKey8.16b,vKey8.16b
+    aesimc  vKey7.16b,vKey7.16b      // middle key keeps its position
+    vswap vKey6,vKey8
+.endm
+
+/*
+ * void XTS_AES_256_dec_ce(
+ *      uint8_t *k2,            //!< key used for tweaking, 32 bytes
+ *      uint8_t *k1,            //!< key used for decryption of tweaked ciphertext, 32 bytes
+ *      uint8_t *TW_initial,    //!< initial tweak value, 16 bytes
+ *      uint64_t N,             //!< sector size, in bytes
+ *      const uint8_t *ct,      //!< ciphertext sector input data
+ *      uint8_t *pt             //!< plaintext sector output data
+ *      );
+*/
+    .global XTS_AES_256_dec_ce
+    .type XTS_AES_256_dec_ce, %function
+XTS_AES_256_dec_ce:
+    // mode 1 selects decryption; the expander macro name is followed by its arguments
+    xts_aes_crypt 1,keyexp_and_encrypt_tweak vIV0,key2,key1
+    .size XTS_AES_256_dec_ce, .-XTS_AES_256_dec_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S
new file mode 100644
index 000000000..8e4088a4d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S
@@ -0,0 +1,88 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+    .arch armv8-a+crypto
+    .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+/* encrypt the tweak by tweak key (k2), and at the same time
+ * to expand encryption key (k1)
+ * even though two sets of keys share the same set of registers
+ * they never overlap at any given time (k2 is used once and discarded)
+ *
+ *   iv: vector register holding the tweak, run through aes_enc_round
+ *       for round key indices 0..14
+ *   k2: pointer register to the raw tweak key
+ *   k1: pointer register to the raw data key
+ *
+ * Each expansion step produces two round keys. Steps without a
+ * register argument belong to the k2 schedule; steps that name rcon2
+ * belong to the k1 schedule.
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+    aes_key_expand 0,\k2
+    aes_enc_round \iv,0
+    aes_enc_round \iv,1
+    aes_key_expand 2
+    aes_key_expand 0,\k1,rcon2
+    aes_enc_round \iv,2
+    aes_enc_round \iv,3
+    aes_key_expand 4
+    aes_key_expand 2,rcon2
+    aes_enc_round \iv,4
+    aes_enc_round \iv,5
+    aes_key_expand 6
+    aes_key_expand 4,rcon2
+    aes_enc_round \iv,6
+    aes_enc_round \iv,7
+    aes_key_expand 8
+    aes_key_expand 6,rcon2
+    aes_enc_round \iv,8
+    aes_enc_round \iv,9
+    aes_key_expand 10
+    aes_key_expand 8,rcon2
+    aes_enc_round \iv,10
+    aes_enc_round \iv,11
+    aes_key_expand 12
+    aes_key_expand 10,rcon2
+    aes_enc_round \iv,12
+    aes_enc_round \iv,13
+    aes_key_expand 14
+    aes_key_expand 12,rcon2
+    aes_enc_round \iv,14
+    aes_key_expand 14,rcon2
+.endm
+
+/*
+ * void XTS_AES_256_enc_ce(
+ *      uint8_t *k2,            //!< key used for tweaking, 32 bytes
+ *      uint8_t *k1,            //!< key used for encryption of tweaked plaintext, 32 bytes
+ *      uint8_t *TW_initial,    //!< initial tweak value, 16 bytes
+ *      uint64_t N,             //!< sector size, in bytes
+ *      const uint8_t *pt,      //!< cleartext sector input data
+ *      uint8_t *ct             //!< ciphertext sector output data
+ *      );
+ */
+    .global XTS_AES_256_enc_ce
+    .type XTS_AES_256_enc_ce, %function
+XTS_AES_256_enc_ce:
+    // mode 0 selects encryption; the expander macro name is followed by its arguments
+    xts_aes_crypt 0,keyexp_and_encrypt_tweak vIV0,key2,key1
+    .size XTS_AES_256_enc_ce, .-XTS_AES_256_enc_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S
new file mode 100644
index 000000000..c32a13820
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S
@@ -0,0 +1,232 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/* Bind the names q<name>, v<name>, s<name> and d<name> to vector
+ * register number reg. A symbol q<name> is also set so that a later
+ * redeclaration can detect the earlier binding and release it first.
+ */
+.macro declare_var_vector_reg name:req,reg:req
+.ifdef q\name
+    .unreq q\name
+    .unreq v\name
+    .unreq s\name
+    .unreq d\name
+.endif
+    .set q\name , \reg
+    q\name .req q\reg
+    v\name .req v\reg
+    s\name .req s\reg
+    d\name .req d\reg
+.endm
+
+/* Bind <name> and x<name> to general register x<reg>, and w<name> to
+ * its 32-bit view w<reg>.
+ */
+.macro declare_var_generic_reg name:req,reg:req
+    \name .req x\reg
+    x\name .req x\reg
+    w\name .req w\reg
+.endm
+
+    /* vector register map shared by all XTS variants */
+    declare_var_vector_reg zero ,0
+    declare_var_vector_reg tmp,1
+    declare_var_vector_reg mask,2
+    declare_var_vector_reg dest,3
+    declare_var_vector_reg blk0,4
+    declare_var_vector_reg blk1,5
+    declare_var_vector_reg blk2,6
+    declare_var_vector_reg blk3,7
+    /* round keys 11..14 sit in v8..v11, the registers whose d halves
+     * are stored by the 256-bit save_stack */
+    declare_var_vector_reg Key11,8
+    declare_var_vector_reg Key12,9
+    declare_var_vector_reg Key13,10
+    declare_var_vector_reg Key14,11
+    declare_var_vector_reg SavedIv,16
+    declare_var_vector_reg IV0,17
+    declare_var_vector_reg IV1,18
+    declare_var_vector_reg IV2,19
+    declare_var_vector_reg IV3,20
+    declare_var_vector_reg Key0,21
+    declare_var_vector_reg Key1,22
+    declare_var_vector_reg Key2,23
+    declare_var_vector_reg Key3,24
+    declare_var_vector_reg Key4,25
+    declare_var_vector_reg Key5,26
+    declare_var_vector_reg Key6,27
+    declare_var_vector_reg Key7,28
+    declare_var_vector_reg Key8,29
+    declare_var_vector_reg Key9,30
+    declare_var_vector_reg Key10,31
+
+/* aes_round with mode 0 (encrypt) */
+.macro aes_enc_round block:req,key:req
+    aes_round \block,\key,0
+.endm
+
+/* aes_round with mode 1 (decrypt) */
+.macro aes_dec_round block:req,key:req
+    aes_round \block,\key,1
+.endm
+
+/* Compute the next XTS tweak: next = current shifted left by one bit as
+ * a 128-bit value, with 0x87 XORed into the low byte when the bit
+ * shifted out of the top was set. current and next may name the same
+ * register. Clobbers ivh, ivl, tmpx/tmpw and tmpx2/tmpw2.
+ */
+.macro update_iv current:req,next:req
+    mov     ivh,\current\().d[1]
+    mov     ivl,\current\().d[0]
+    mov     tmpw,#0x87
+    extr    tmpx2,ivh,ivh,#32            // tmpw2 = upper 32 bits of ivh
+    extr    ivh,ivh,ivl,#63              // ivh = (ivh << 1) | (ivl >> 63)
+    and     tmpw,tmpw,tmpw2,asr#31       // keep 0x87 only if the top bit was set
+    eor     ivl,tmpx,ivl,lsl#1
+    mov     \next\().d[1],ivh
+    mov     \next\().d[0],ivl
+.endm
+
+/* Process four consecutive 16-byte blocks.
+ *   inp, outp: pointer registers, each post-incremented by 64
+ *   mode:      forwarded to aes_rounds_interleave
+ *   is_tail:   when non-blank, the tweak following vIV2 is parked in
+ *              vSavedIv and the fourth block uses the tweak after
+ *              that one; vIV0 is then left untouched on exit
+ * On entry vIV0 holds the tweak of the first block. Without is_tail,
+ * vIV0 holds the tweak following vIV3 on exit.
+ */
+.macro process_4_blks inp:req,outp:req,mode:req,is_tail
+    update_iv vIV0,vIV1
+    update_iv vIV1,vIV2
+    ldp     qblk0,qblk1,[\inp],#32
+    ldp     qblk2,qblk3,[\inp],#32
+    .ifnb \is_tail
+        update_iv vIV2, vSavedIv
+        update_iv vSavedIv,vIV3
+    .else
+        update_iv vIV2,vIV3
+    .endif
+    eor     vblk0.16b,vblk0.16b,vIV0.16b
+    eor     vblk1.16b,vblk1.16b,vIV1.16b
+    eor     vblk2.16b,vblk2.16b,vIV2.16b
+    eor     vblk3.16b,vblk3.16b,vIV3.16b
+
+    aes_rounds_interleave vblk0,vblk1,vblk2,vblk3,\mode
+    eor     vblk0.16b,vblk0.16b,vIV0.16b
+    eor     vblk1.16b,vblk1.16b,vIV1.16b
+    stp     qblk0,qblk1,[\outp],#32
+    eor     vblk2.16b,vblk2.16b,vIV2.16b
+    eor     vblk3.16b,vblk3.16b,vIV3.16b
+    stp     qblk2,qblk3,[\outp],#32
+    .ifb \is_tail
+        update_iv vIV3,vIV0
+    .endif
+.endm
+
+/* Process one 16-byte block with the tweak in vIV0: XOR, AES rounds,
+ * XOR again. inp and outp are each post-incremented by 16; vIV0 is not
+ * advanced.
+ */
+.macro process_1_blk inp:req,outp:req,mode:req
+    ld1     {vblk0.16b},[\inp],#16
+    eor     vblk0.16b,vblk0.16b,vIV0.16b
+    aes_rounds vblk0,\mode
+    eor     vblk0.16b,vblk0.16b,vIV0.16b
+    str     qblk0,[\outp], #16
+.endm
+
+    /* x0..x5 carry the six arguments of the XTS entry points in
+     * prototype order; the remaining aliases are scratch registers */
+    key2 .req x0
+    key1 .req x1
+    iv .req x2
+    bytes .req x3
+    inp .req x4
+    outp .req x5
+    rcon .req w6
+    blocks .req x7
+    tmpx .req x8
+    tmpw .req w8
+    tmpw2 .req w9
+    tmpx2 .req x9
+    ivl .req x10
+    ivh .req x11
+    lastblk .req x12
+    tmpbuf .req x13
+    tailcnt .req x14
+    rcon2 .req w15
+
+/* Body shared by every XTS entry point.
+ *   mode:     0 = encrypt, 1 = decrypt
+ *   expander: optional macro invoked after the tweak has been loaded
+ *             into vIV0; more holds the arguments handed to it
+ * Inputs shorter than 16 bytes return without producing output. Full
+ * blocks are handled four at a time, then one at a time. A trailing
+ * partial block is handled by ciphertext stealing through tmpbuf.
+ */
+.macro xts_aes_crypt mode:req,expander,more:vararg
+    save_stack
+
+    ld1     {vIV0.16b},[iv],16
+    .ifnb \expander
+        \expander\() \more
+    .endif
+    lsr     blocks,bytes,4               // number of full blocks
+    and     tailcnt,bytes,#0x0F          // bytes in the partial last block
+
+    cmp     bytes,16
+    b.lt    .return
+
+.process_4_blks:
+    cmp     blocks, 4
+    b.lt    .singles
+    subs    blocks,blocks,4
+    /* in decryption mode, check whether this is
+     * last block before the less-than-one-block tail
+     * need to swap tweak in this case
+     */
+    .if \mode == 1
+        b.gt    .not_tail_4blk
+        cmp     tailcnt,1
+        b.lt    .not_tail_4blk
+        process_4_blks inp,outp,\mode,1
+        b       .process_4_blks
+.not_tail_4blk:
+    .endif
+    process_4_blks inp,outp,\mode
+    b       .process_4_blks
+
+.singles:
+    subs    blocks,blocks,#1
+    b.lt    .checktail
+    /* in decryption mode, check whether this is
+     * last block before the less-than-one-block tail
+     * need to swap tweak in this case
+     */
+    .if \mode == 1
+        b.gt    .not_tail_1blk
+        cmp     tailcnt,1
+        b.lt    .not_tail_1blk
+        mov     vSavedIv.16b, vIV0.16b   // keep the earlier tweak for the stolen block
+        update_iv vSavedIv, vIV0
+        process_1_blk inp,outp,\mode
+        b       .checktail
+.not_tail_1blk:
+    .endif
+    process_1_blk inp,outp,\mode
+    update_iv vIV0,vIV0
+    b       .singles
+.checktail:
+    cmp     tailcnt,1
+    b.lt    .return
+    sub     lastblk,outp,#16             // last full output block
+.copytail:
+    /* the head of the last full output block becomes the partial
+     * output; the partial input goes to the head of tmpbuf */
+    subs    tailcnt,tailcnt,#1
+    ldrb    tmpw,[lastblk,tailcnt]
+    strb    tmpw,[outp,tailcnt]
+    ldrb    tmpw,[inp,tailcnt]
+    strb    tmpw,[tmpbuf,tailcnt]
+    b.gt    .copytail
+    and     tailcnt,bytes,#0x0F
+.steal:
+    /* pad tmpbuf up to 16 bytes with the rest of the last full
+     * output block; the branch uses the flags set by cmp */
+    cmp     tailcnt,15
+    ldrb    tmpw,[lastblk,tailcnt]
+    strb    tmpw,[tmpbuf,tailcnt]
+    add     tailcnt,tailcnt,#1
+    b.lt    .steal
+    .if \mode == 1
+        mov     vIV0.16b,vSavedIv.16b
+    .endif
+    process_1_blk tmpbuf,lastblk,\mode
+.return:
+    restore_stack
+    ret
+.endm
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S
new file mode 100644
index 000000000..9549ebfa0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S
@@ -0,0 +1,49 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_128_common.S" +#include "xts_aes_common.S" + +/* + * void XTS_AES_128_dec_expanded_key_ce( + * uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes - encryption key is used + * uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*11 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *ct, //!< ciphertext sector input data + * uint8_t *pt //!< plaintext sector output data + * ); + * + * The whole routine is one expansion of the xts_aes_crypt macro, which is not defined in this file. + * NOTE(review): the leading argument 1 presumably selects decryption mode; confirm against the macro definition. +*/ + .global XTS_AES_128_dec_expanded_key_ce + .type XTS_AES_128_dec_expanded_key_ce, %function +XTS_AES_128_dec_expanded_key_ce: + xts_aes_crypt 1,keyload_and_encrypt_tweak,vIV0,key2,key1 + .size XTS_AES_128_dec_expanded_key_ce, .-XTS_AES_128_dec_expanded_key_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S new file mode 100644 index 000000000..1f2d2db2e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_128_common.S" +#include "xts_aes_common.S" + +/* + * void XTS_AES_128_enc_expanded_key_ce( + * uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes + * uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*11 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *pt, //!< plaintext sector input data + * uint8_t *ct //!< ciphertext sector output data + * ); + * + * The whole routine is one expansion of the xts_aes_crypt macro, which is not defined in this file. + * NOTE(review): the leading argument 0 presumably selects encryption mode; confirm against the macro definition. + */ + .global XTS_AES_128_enc_expanded_key_ce + .type XTS_AES_128_enc_expanded_key_ce, %function +XTS_AES_128_enc_expanded_key_ce: + xts_aes_crypt 0,keyload_and_encrypt_tweak,vIV0,key2,key1 + .size XTS_AES_128_enc_expanded_key_ce, .-XTS_AES_128_enc_expanded_key_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S new file mode 100644 index 
000000000..95c8bf63d --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_256_common.S" +#include "xts_aes_common.S" + +/* + * void XTS_AES_256_dec_expanded_key_ce( + * uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes - encryption key is used + * uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*15 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *ct, //!< ciphertext sector input data + * uint8_t *pt //!< plaintext sector output data + * ); + * + * The whole routine is one expansion of the xts_aes_crypt macro, which is not defined in this file. + * NOTE(review): the leading argument 1 presumably selects decryption mode; confirm against the macro definition. +*/ + .global XTS_AES_256_dec_expanded_key_ce + .type XTS_AES_256_dec_expanded_key_ce, %function +XTS_AES_256_dec_expanded_key_ce: + xts_aes_crypt 1,keyload_and_encrypt_tweak,vIV0,key2,key1 + .size XTS_AES_256_dec_expanded_key_ce, .-XTS_AES_256_dec_expanded_key_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S new file mode 100644 index 000000000..bd840a994 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8-a+crypto + .text + +#include "xts_aes_256_common.S" +#include "xts_aes_common.S" + +/* + * void XTS_AES_256_enc_expanded_key_ce( + * uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes + * uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*15 bytes + * uint8_t *TW_initial, //!< initial tweak value, 16 bytes + * uint64_t N, //!< sector size, in bytes + * const uint8_t *pt, //!< plaintext sector input data + * uint8_t *ct //!< ciphertext sector output data + * ); + * + * The whole routine is one expansion of the xts_aes_crypt macro, which is not defined in this file. + * NOTE(review): the leading argument 0 presumably selects encryption mode; confirm against the macro definition. + */ + .global XTS_AES_256_enc_expanded_key_ce + .type XTS_AES_256_enc_expanded_key_ce, %function +XTS_AES_256_enc_expanded_key_ce: + xts_aes_crypt 0,keyload_and_encrypt_tweak,vIV0,key2,key1 + .size XTS_AES_256_enc_expanded_key_ce, .-XTS_AES_256_enc_expanded_key_ce diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S new file mode 100644 
index 000000000..af77d885b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S @@ -0,0 +1,39 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "aarch64_multibinary.h" + +/* + * One mbin_interface expansion per XTS entry point: the 128- and 256-bit enc/dec routines and their expanded-key variants. + * NOTE(review): mbin_interface is not defined in this file (presumably it comes from aarch64_multibinary.h); confirm there what each expansion emits. + */ +mbin_interface XTS_AES_128_enc +mbin_interface XTS_AES_128_dec +mbin_interface XTS_AES_128_enc_expanded_key +mbin_interface XTS_AES_128_dec_expanded_key +mbin_interface XTS_AES_256_enc +mbin_interface XTS_AES_256_dec +mbin_interface XTS_AES_256_enc_expanded_key +mbin_interface XTS_AES_256_dec_expanded_key diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm new file mode 100644 index 000000000..22f00b395 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm @@ -0,0 +1,377 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef _AES_COMMON_ASM_ +%define _AES_COMMON_ASM_ + +%include "reg_sizes.asm" + +;; ============================================================================= +;; Generic macro to produce code that executes %%OPCODE instruction +;; on selected number of AES blocks (16 bytes long) between 0 and 16. +;; All three operands of the instruction come from registers. +;; Note: if 3 blocks are left at the end instruction is produced to operate all +;; 4 blocks (full width of ZMM) +;; Overwrites the assembly-time symbols reg_idx and blocks_left (plain %assign, not macro-local). + +%macro ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 14 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OPCODE %2 ; [in] instruction name +%define %%DST0 %3 ; [out] destination ZMM register +%define %%DST1 %4 ; [out] destination ZMM register +%define %%DST2 %5 ; [out] destination ZMM register +%define %%DST3 %6 ; [out] destination ZMM register +%define %%SRC1_0 %7 ; [in] source 1 ZMM register +%define %%SRC1_1 %8 ; [in] source 1 ZMM register +%define %%SRC1_2 %9 ; [in] source 1 ZMM register +%define %%SRC1_3 %10 ; [in] source 1 ZMM register +%define %%SRC2_0 %11 ; [in] source 2 ZMM register +%define %%SRC2_1 %12 ; [in] source 2 ZMM register +%define %%SRC2_2 %13 ; [in] source 2 ZMM register +%define %%SRC2_3 %14 ; [in] source 2 ZMM register + +%assign reg_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%DSTREG %%DST %+ 
reg_idx +%xdefine %%SRC1REG %%SRC1_ %+ reg_idx +%xdefine %%SRC2REG %%SRC2_ %+ reg_idx + %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG +%undef %%DSTREG +%undef %%SRC1REG +%undef %%SRC2REG +%assign reg_idx (reg_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep + +%xdefine %%DSTREG %%DST %+ reg_idx +%xdefine %%SRC1REG %%SRC1_ %+ reg_idx +%xdefine %%SRC2REG %%SRC2_ %+ reg_idx + +%if blocks_left == 1 + %%OPCODE XWORD(%%DSTREG), XWORD(%%SRC1REG), XWORD(%%SRC2REG) +%elif blocks_left == 2 + %%OPCODE YWORD(%%DSTREG), YWORD(%%SRC1REG), YWORD(%%SRC2REG) +%elif blocks_left == 3 + %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG +%endif + +%endmacro + +;; ============================================================================= +;; Loads specified number of AES blocks into ZMM registers +;; %%FLAGS are optional and only affect behavior when 3 trailing blocks are left +;; - if %%FLAGS not provided then exactly 3 blocks are loaded (move and insert) +;; - if "load_4_instead_of_3" option is passed then 4 blocks are loaded +;; Overwrites the assembly-time symbols src_offset, dst_idx and blocks_left (plain %assign, not macro-local). +%macro ZMM_LOAD_BLOCKS_0_16 7-8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%INP %2 ; [in] input data pointer to read from +%define %%DATA_OFFSET %3 ; [in] offset to the input pointer (GP or numerical) +%define %%DST0 %4 ; [out] ZMM register with loaded data +%define %%DST1 %5 ; [out] ZMM register with loaded data +%define %%DST2 %6 ; [out] ZMM register with loaded data +%define %%DST3 %7 ; [out] ZMM register with loaded data +%define %%FLAGS %8 ; [in] optional "load_4_instead_of_3" + +%assign src_offset 0 +%assign dst_idx 0 + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%DSTREG %%DST %+ dst_idx + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%undef %%DSTREG +%assign src_offset (src_offset + 64) +%assign dst_idx (dst_idx + 1) +%endrep + +%assign blocks_left (%%NUM_BLOCKS % 4) +%xdefine %%DSTREG %%DST %+ dst_idx + +%if blocks_left == 1 + vmovdqu8 XWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left 
== 2 + vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 3 +%ifidn %%FLAGS, load_4_instead_of_3 + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%else + vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset] + vinserti64x2 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset + 32], 2 +%endif +%endif + +%endmacro + +;; ============================================================================= +;; Loads specified number of AES blocks into ZMM registers using mask register +;; for the last loaded register (xmm, ymm or zmm). +;; Loads take place at 1 byte granularity. +;; Only the last register is loaded through the mask (zeroing, {z}); all preceding registers are loaded as full 64 bytes. +;; Overwrites the assembly-time symbols src_offset, dst_idx and blocks_left (plain %assign, not macro-local). +%macro ZMM_LOAD_MASKED_BLOCKS_0_16 8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%INP %2 ; [in] input data pointer to read from +%define %%DATA_OFFSET %3 ; [in] offset to the input pointer (GP or numerical) +%define %%DST0 %4 ; [out] ZMM register with loaded data +%define %%DST1 %5 ; [out] ZMM register with loaded data +%define %%DST2 %6 ; [out] ZMM register with loaded data +%define %%DST3 %7 ; [out] ZMM register with loaded data +%define %%MASK %8 ; [in] mask register + +%assign src_offset 0 +%assign dst_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%if %%NUM_BLOCKS > 0 +%rep (((%%NUM_BLOCKS + 3) / 4) - 1) +%xdefine %%DSTREG %%DST %+ dst_idx + vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset] +%undef %%DSTREG +%assign src_offset (src_offset + 64) +%assign dst_idx (dst_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep +%endif ; %if %%NUM_BLOCKS > 0 + +%xdefine %%DSTREG %%DST %+ dst_idx + +%if blocks_left == 1 + vmovdqu8 XWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%elif blocks_left == 2 + vmovdqu8 YWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%elif (blocks_left == 3 || blocks_left == 4) + vmovdqu8 %%DSTREG{%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset] +%endif + +%endmacro + +;; ============================================================================= 
+;; Stores specified number of AES blocks from ZMM registers +;; A 3-block tail is written as 32 bytes (YMM) plus lane 2 via vextracti32x4, so exactly 48 bytes are stored. +;; Overwrites the assembly-time symbols dst_offset, src_idx and blocks_left (plain %assign, not macro-local). +%macro ZMM_STORE_BLOCKS_0_16 7 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OUTP %2 ; [in] output data pointer to write to +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%SRC0 %4 ; [in] ZMM register with data to store +%define %%SRC1 %5 ; [in] ZMM register with data to store +%define %%SRC2 %6 ; [in] ZMM register with data to store +%define %%SRC3 %7 ; [in] ZMM register with data to store + +%assign dst_offset 0 +%assign src_idx 0 + +%rep (%%NUM_BLOCKS / 4) +%xdefine %%SRCREG %%SRC %+ src_idx + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG +%undef %%SRCREG +%assign dst_offset (dst_offset + 64) +%assign src_idx (src_idx + 1) +%endrep + +%assign blocks_left (%%NUM_BLOCKS % 4) +%xdefine %%SRCREG %%SRC %+ src_idx + +%if blocks_left == 1 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], XWORD(%%SRCREG) +%elif blocks_left == 2 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG) +%elif blocks_left == 3 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG) + vextracti32x4 [%%OUTP + %%DATA_OFFSET + dst_offset + 32], %%SRCREG, 2 +%endif + +%endmacro + +;; ============================================================================= +;; Stores specified number of AES blocks from ZMM registers with mask register +;; for the last stored register (xmm, ymm or zmm). +;; Stores take place at 1 byte granularity. 
+%macro ZMM_STORE_MASKED_BLOCKS_0_16 8 +%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16) +%define %%OUTP %2 ; [in] output data pointer to write to +%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical) +%define %%SRC0 %4 ; [in] ZMM register with data to store +%define %%SRC1 %5 ; [in] ZMM register with data to store +%define %%SRC2 %6 ; [in] ZMM register with data to store +%define %%SRC3 %7 ; [in] ZMM register with data to store +%define %%MASK %8 ; [in] mask register +;; Only the final register is stored through %%MASK; all preceding registers are stored as full 64 bytes. +;; Overwrites the assembly-time symbols dst_offset, src_idx and blocks_left (plain %assign, not macro-local). + +%assign dst_offset 0 +%assign src_idx 0 +%assign blocks_left %%NUM_BLOCKS + +%if %%NUM_BLOCKS > 0 +%rep (((%%NUM_BLOCKS + 3) / 4) - 1) +%xdefine %%SRCREG %%SRC %+ src_idx + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG +%undef %%SRCREG +%assign dst_offset (dst_offset + 64) +%assign src_idx (src_idx + 1) +%assign blocks_left (blocks_left - 4) +%endrep +%endif ; %if %%NUM_BLOCKS > 0 + +%xdefine %%SRCREG %%SRC %+ src_idx + +%if blocks_left == 1 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, XWORD(%%SRCREG) +%elif blocks_left == 2 + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, YWORD(%%SRCREG) +%elif (blocks_left == 3 || blocks_left == 4) + vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, %%SRCREG +%endif + +%endmacro + +;;; =========================================================================== +;;; Handles AES encryption rounds +;;; It handles special cases: the last and first rounds +;;; Optionally, it performs XOR with data after the last AES round. +;;; Uses NROUNDS parameter to check what needs to be done for the current round. +;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). 
+%macro ZMM_AESENC_ROUND_BLOCKS_0_16 12 +%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3 +%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7 +%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11 +%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15 +%define %%KEY %5 ; [in] zmm containing round key +%define %%ROUND %6 ; [in] round number +%define %%D0_3 %7 ; [in] zmm or no_data; plain/cipher text blocks 0-3 +%define %%D4_7 %8 ; [in] zmm or no_data; plain/cipher text blocks 4-7 +%define %%D8_11 %9 ; [in] zmm or no_data; plain/cipher text blocks 8-11 +%define %%D12_15 %10 ; [in] zmm or no_data; plain/cipher text blocks 12-15 +%define %%NUMBL %11 ; [in] number of blocks; numerical value +%define %%NROUNDS %12 ; [in] number of rounds; numerical value +;;; Round dispatch: ROUND < 1 -> XOR with key; 1 <= ROUND <= NROUNDS -> vaesenc; ROUND > NROUNDS -> vaesenclast. +;;; The data XOR after the last round is emitted only when none of D0_3..D12_15 is no_data. + +;;; === first AES round +%if (%%ROUND < 1) + ;; round 0 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; ROUND 0 + +;;; === middle AES rounds +%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS) + ;; rounds 1 to 9/11/13 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenc, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; rounds 1 to 9/11/13 + +;;; === last AES round +%if (%%ROUND > %%NROUNDS) + ;; the last round - mix enclast with text xor's + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenclast, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY + +;;; === XOR with data +%ifnidn %%D0_3, no_data +%ifnidn %%D4_7, no_data +%ifnidn %%D8_11, no_data +%ifnidn %%D12_15, no_data + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%D0_3, %%D4_7, %%D8_11, %%D12_15 +%endif ; !no_data +%endif ; !no_data +%endif ; 
!no_data +%endif ; !no_data + +%endif ; The last round + +%endmacro + +;;; =========================================================================== +;;; Handles AES decryption rounds +;;; It handles special cases: the last and first rounds +;;; Optionally, it performs XOR with data after the last AES round. +;;; Uses NROUNDS parameter to check what needs to be done for the current round. +;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). +;;; Round dispatch: ROUND < 1 -> XOR with key; 1 <= ROUND <= NROUNDS -> vaesdec; ROUND > NROUNDS -> vaesdeclast. +;;; The data XOR after the last round is emitted only when none of D0_3..D12_15 is no_data. +%macro ZMM_AESDEC_ROUND_BLOCKS_0_16 12 +%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3 +%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7 +%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11 +%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15 +%define %%KEY %5 ; [in] zmm containing round key +%define %%ROUND %6 ; [in] round number +%define %%D0_3 %7 ; [in] zmm or no_data; cipher text blocks 0-3 +%define %%D4_7 %8 ; [in] zmm or no_data; cipher text blocks 4-7 +%define %%D8_11 %9 ; [in] zmm or no_data; cipher text blocks 8-11 +%define %%D12_15 %10 ; [in] zmm or no_data; cipher text blocks 12-15 +%define %%NUMBL %11 ; [in] number of blocks; numerical value +%define %%NROUNDS %12 ; [in] number of rounds; numerical value + +;;; === first AES round +%if (%%ROUND < 1) + ;; round 0 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; ROUND 0 + +;;; === middle AES rounds +%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS) + ;; rounds 1 to 9/11/13 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdec, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY +%endif ; rounds 1 to 9/11/13 + +;;; === last AES round +%if (%%ROUND > %%NROUNDS) + ;; the last round - mix declast with text xor's + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdeclast, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, 
%%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%KEY, %%KEY, %%KEY, %%KEY + +;;; === XOR with data +%ifnidn %%D0_3, no_data +%ifnidn %%D4_7, no_data +%ifnidn %%D8_11, no_data +%ifnidn %%D12_15, no_data + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \ + %%D0_3, %%D4_7, %%D8_11, %%D12_15 +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data +%endif ; !no_data + +%endif ; The last round + +%endmacro + +%endif ;; _AES_COMMON_ASM diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm new file mode 100644 index 000000000..2a879abdd --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm @@ -0,0 +1,431 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; the following defines control the operation of the macros below and +; need to be defined in the including file +; KEY_ROUNDS - number of key rounds needed based on key length: 128bit - 11, 192bit - 13 or 256bit - 15 +; EARLY_BLOCKS - number of data blocks to load before starting computations +; PARALLEL_BLOCKS - number of blocks of data to process in parallel also the number of xmm regs to reserve for data +; IV_CNT - number of xmm regs to use for IV data valid values of 0 or 1 +; TMP_CNT - number of tmp xmm register to reserve +; XMM_USAGE - number of xmm registers to use. 
must be at least the same as PARALLEL_BLOCKS + 2 +; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +; +; the following instructions set specific macros must be defined in the user file +; to make use of the AES macros below +; MOVDQ - move from memory to xmm reg +; PXOR - XOR of two xmm registers pxor +; AES_DEC - AES block decode for early key rounds +; AES_DEC_LAST - AES block decode for last key round +; or +; AES_ENC - AES block encode for early key rounds +; AES_ENC_LAST - AES block encode for last key round + +; Three usages of xmm regs: key round cache, blocks data and one temp +; CKEY_CNT are (number of xmm regs) - PARALLEL_BLOCKS - IV holder - 2 TMP xmm regs +%assign FIRST_XDATA (0) +%assign IV_IDX (FIRST_XDATA + PARALLEL_BLOCKS) +%ifndef IV_CNT +%define IV_CNT (1) +%endif +%assign TMP (IV_IDX + IV_CNT) +%assign TMP_CNT (2) +%assign FIRST_CKEY (TMP + TMP_CNT) +%assign CKEY_CNT (XMM_USAGE - (PARALLEL_BLOCKS + IV_CNT + TMP_CNT)) + +; Abstract xmm register usages that identify the expected contents of the register +%define reg(i) xmm %+ i +%define XDATA(i) xmm %+ i +%define KEY_REG(i) xmm %+ i +%define IV_REG(i) xmm %+ i + +%define IDX rax + + + + +; +; +; AES CBC ENCODE MACROS +; +; + +; +; CBC_DECRYPT_BLOCKS +; Decrypts a number of blocks using AES_PARALLEL_ENC_BLOCKS macro +; Finalizes the decryption and saves results in the output +; places the last block's cipher text in IV for the next buffer +; updates the index and number of bytes left +; All input and output addressing uses the %%IDX argument, so the caller may pass any index register +; (the file-level IDX define is only the conventional choice). +; +%macro CBC_DECRYPT_BLOCKS 17 +%define %%TOT_ROUNDS %1 +%define %%num_blocks %2 ; can be 0..13 +%define %%EARLY_LOADS %3 ; number of data blocks to load before processing +%define %%MOVDQ %4 +%define %%PXOR %5 +%define %%AES_DEC %6 +%define %%AES_DEC_LAST %7 +%define %%CACHED_KEYS %8 ; number of key data cached in xmm regs +%define %%TMP %9 +%define %%TMP_CNT %10 +%define %%FIRST_CKEY %11 +%define %%KEY_DATA %12 +%define %%FIRST_XDATA %13 +%define %%IN %14 ; input data +%define %%OUT %15 ; output 
data +%define %%IDX %16 ; index into input and output data buffers +%define %%LEN %17 + + AES_PARALLEL_ENC_BLOCKS %%TOT_ROUNDS, %%num_blocks, %%EARLY_LOADS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST, %%CACHED_KEYS, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%KEY_DATA, %%FIRST_XDATA, %%IN, %%OUT, %%IDX + + ; + ; XOR the result of each block's decrypt with the previous block's cypher text (C) + ; + %assign i 0 + %rep (%%num_blocks) + %%PXOR XDATA(i), XDATA(IV_IDX) ; XOR result with previous block's C + %%MOVDQ [%%OUT + %%IDX + i*16], XDATA(i) ; save plain text to out + %%MOVDQ XDATA(IV_IDX), [%%IN + %%IDX + i*16] ; load IV with current block C (same %%IDX as the store above) + %assign i (i+1) + %endrep + + add %%IDX, %%num_blocks*16 + sub %%LEN, %%num_blocks*16 +%endmacro + + +; +; CBC_ENC_INIT +; XOR first data block with the IV data +%macro CBC_ENC_INIT 7 +%define %%P_FIRST %1 +%define %%IV_IDX %2 +%define %%MOVDQ %3 +%define %%PXOR %4 +%define %%IV %5 +%define %%IN %6 ; input data +%define %%IDX %7 ; index into input and output data buffers + + %%MOVDQ XDATA(%%P_FIRST), [%%IN + %%IDX + 0*16] + %%MOVDQ reg(%%IV_IDX), [%%IV] + %%PXOR XDATA(%%P_FIRST), reg(%%IV_IDX) +%endmacro + +; +; assumptions: +; LEN is length of data remaining +; IDX is offset into the data buffer +; +; subloops +; if data > 16 load next block into a next XDATA reg (XDATA(p_next)) +; load first uncached key into TMP0 (if any) +; AES block encrypt XDATA(P_FIRST) +; if data > 16 XOR next2 block (XDATA(p_next)) with current (XDATA(P_FIRST)) +; save current (XDATA(P_FIRST)) +; update indexes for P_FIRST +; end if data zero +; +%macro CBC_ENC_SUBLOOP 17 +%define %%TOT_ROUNDS %1 +%define %%BLOCKS %2 ; can be 1...14 +%define %%START_DATA %3 +%define %%MOVDQ %4 +%define %%PXOR %5 +%define %%AES_DEC %6 +%define %%AES_DEC_LAST %7 +%define %%TMP %8 +%define %%TMP_CNT %9 +%define %%FIRST_CKEY %10 +%define %%CKEY_CNT %11 +%define %%KEYS %12 +%define %%CACHED_KEYS %13 +%define %%IN %14 ; input data +%define %%OUT %15 ; output data +%define %%IDX %16 ; 
index into input and output data buffers +%define %%LEN %17 + + %assign this_blk 0 + %assign next_blk 1 + %assign p_first %%START_DATA + %assign p_next (p_first+1) + ; for number of blocks to be processed in a loop + %assign blk 1 + %rep %%BLOCKS + ; if data > 16 load next block into a next XDATA reg (XDATA(p_next)) + cmp %%LEN, 16 + %push skip_read + je %$skip_read_next + %%MOVDQ XDATA(p_next), [%%IN + %%IDX + next_blk*16] + %$skip_read_next: + %pop + + AES_ENC_BLOCKS %%TOT_ROUNDS, p_first, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%CKEY_CNT, %%KEYS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST + + ; if data > 16 XOR next2 block (XDATA(p_next)) with current (XDATA(p_first)) + cmp %%LEN, 16 + %push skip_next + je %$skip_next_blk_start + %%PXOR XDATA(p_next), XDATA(p_first) + %$skip_next_blk_start: + %pop + + ; save current (XDATA(p_first)) + %%MOVDQ [%%OUT + %%IDX + this_blk*16], XDATA(p_first) + ; update indexes for p_first + add %%IDX, 16 + sub %%LEN, 16 + + %if (blk < %%BLOCKS) ; only insert jz if NOT last block + ; end if data zero + jz %%END_CBC_ENC_SUBLOOP + %endif ; (blk < %%BLOCKS) + + %assign p_first (p_next) + %assign blk (blk+1) + %if (blk == %%BLOCKS) ; the last rep loop's read of the next block needs to be into START_DATA + %assign p_next (%%START_DATA) + %elif (1 == %%BLOCKS) + %%MOVDQ XDATA(%%START_DATA), XDATA(p_next) + %else + %assign p_next (p_next+1) + %endif + %endrep ; %%BLOCKS + + %%END_CBC_ENC_SUBLOOP: +%endm ; CBC_ENC_SUBLOOP + + +; +; +; AES BLOCK ENCODE MACROS +; +; + +; +; FILL_KEY_CACHE +; Load key data into the cache key xmm regs +%macro FILL_KEY_CACHE 4 +%define %%CACHED_KEYS %1 +%define %%CKEY_START %2 +%define %%KEY_DATA %3 +%define %%MOVDQ %4 + + %assign rnd 0 + %rep KEY_ROUNDS + %if (rnd < %%CACHED_KEYS) ; find the round's key data + %assign c (rnd + %%CKEY_START) + %%MOVDQ KEY_REG(c), [%%KEY_DATA + rnd*16] ;load sub key into an available register + %endif + %assign rnd (rnd+1) + %endrep +%endmacro + +; +; SCHEDULE_DATA_LOAD +; pre-loades 
; message data into xmm regs
; updates global 'blocks_loaded' that tracks which data blocks have been loaded
; 'blocks_loaded' is an in/out global and must be declared in the using macro or function
;
;   %1 PARALLEL_DATA  total number of blocks that are to be loaded
;   %2 EARLY_LOADS    accepted for symmetry with the caller; not referenced here
;   %3 MOVDQ          load instruction to use
;   %4 IN             input data pointer
;   %5 IDX            index into the input buffer
;
;   Each invocation emits at most one load: block 'blocks_loaded' from
;   [IN + IDX + blocks_loaded*16] into XDATA(blocks_loaded).
%macro SCHEDULE_DATA_LOAD 5
%define %%PARALLEL_DATA %1
%define %%EARLY_LOADS   %2
%define %%MOVDQ         %3
%define %%IN            %4
%define %%IDX           %5

    %if (blocks_loaded < %%PARALLEL_DATA)
        ; load cipher text
        %%MOVDQ XDATA(blocks_loaded), [%%IN + %%IDX + blocks_loaded*16]
        %assign blocks_loaded (blocks_loaded+1)
    %endif ; (blocks_loaded < %%PARALLEL_DATA)
%endmacro ; SCHEDULE_DATA_LOAD

;
; INIT_SELECT_KEY
;   resets 'current_tmp' to the first temp register and, when not every round
;   key is cached, loads the first uncached round key into it
;   'current_tmp' is an in/out global and must be declared in the using macro or function
;
;   %1 TOT_ROUNDS   number of round keys
;   %2 CACHED_KEYS  number of round keys held in registers
;   %3 KEY_DATA     register holding the pointer to the expanded key data
;   %4 FIRST_TMP    index of the first temp key register
;   %5 TMP_CNT      number of temp key registers (not referenced here)
;   %6 MOVDQ        load instruction to use
%macro INIT_SELECT_KEY 6
%define %%TOT_ROUNDS    %1
%define %%CACHED_KEYS   %2
%define %%KEY_DATA      %3
%define %%FIRST_TMP     %4
%define %%TMP_CNT       %5
%define %%MOVDQ         %6

    %assign current_tmp (%%FIRST_TMP)
    %if (%%TOT_ROUNDS > %%CACHED_KEYS) ; load the first uncached key into temp reg
        %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%CACHED_KEYS*16]
    %endif ; (%%TOT_ROUNDS > %%CACHED_KEYS)
%endmacro ; INIT_SELECT_KEY

;
; SELECT_KEY
;   determine which xmm reg holds the key data needed or loads it into the temp register if not cached
;   'current_tmp' is an in/out global and must be declared in the using macro or function
;
;   Result: preprocessor variable 'key' is the KEY_REG() index to use for %%ROUND.
;
;   %1 ROUND        round being processed
;   %2 TOT_ROUNDS   number of round keys
;   %3 CACHED_KEYS  number of round keys held in registers
;   %4 FIRST_KEY    index of the first cached key register
;   %5 KEY_DATA     register holding the pointer to the expanded key data
;   %6 FIRST_TMP    index of the first temp key register
;   %7 TMP_CNT      number of temp key registers
;   %8 MOVDQ        load instruction to use
%macro SELECT_KEY 8
%define %%ROUND         %1
%define %%TOT_ROUNDS    %2
%define %%CACHED_KEYS   %3
%define %%FIRST_KEY     %4
%define %%KEY_DATA      %5
%define %%FIRST_TMP     %6
%define %%TMP_CNT       %7
%define %%MOVDQ         %8

    ; find the key data for this round
    %if (%%ROUND < %%CACHED_KEYS) ; is it cached
        %assign key (%%ROUND + %%FIRST_KEY)
    %else
        ; Load non-cached key %%ROUND data ping-ponging between temp regs if more than one
        %assign key (current_tmp) ; register the caller will use for this round
        %if (1 == %%TMP_CNT)
            ; single temp register: (re)load this round's key right before it is used
            %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%ROUND*16]
        %else
            %assign next_round (%%ROUND+1)
            %if (next_round < %%TOT_ROUNDS) ; if more rounds to be done
                %if (current_tmp == %%FIRST_TMP) ; calc the next temp reg to use
                    %assign current_tmp (current_tmp + 1)
                %else
                    %assign current_tmp (%%FIRST_TMP)
                %endif ; (current_tmp == %%FIRST_TMP)
                ; prefetch the next round's key into the other temp register
                %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + next_round*16]

            %endif ; (next_round < %%TOT_ROUNDS)
        %endif ; (1 == %%TMP_CNT)
    %endif ; (%%ROUND < %%CACHED_KEYS)
%endmacro ; SELECT_KEY


;
; AES_PARALLEL_ENC_BLOCKS
;   preloads some data blocks to be worked on
;   starts the aes block encoding while loading the other blocks to be done in parallel
;   aes block encodes each key round on each block
;
;   %1  KEY_ROUNDS     number of round keys (loop count AND last-round test)
;   %2  PARALLEL_DATA  number of blocks processed together
;   %3  EARLY_LOADS    number of blocks loaded before round 0 starts
;   %4  MOVDQ          load instruction
;   %5  PXOR           round-0 XOR instruction/macro
;   %6  AES_DEC        middle-round instruction/macro
;   %7  AES_DEC_LAST   last-round instruction/macro
;   %8  CACHED_KEYS    number of round keys held in registers
;   %9  TMP            index of the first temp key register
;   %10 TMP_CNT        number of temp key registers
;   %11 FIRST_CKEY     index of the first cached key register
;   %12 KEY_DATA       register holding the pointer to the expanded key data
;   %13 FIRST_XDATA    index of the first data register
;   %14 IN             input data pointer
;   %15 OUT            output data pointer (not referenced in this macro)
;   %16 IDX            index into input and output data buffers
;
;   Writes the preprocessor variables blocks_loaded, current_tmp, round, key, i.
;
;   The round loop and the initial 'current_tmp' use the macro's own
;   parameters (%%KEY_ROUNDS, %%TMP) rather than the file-level KEY_ROUNDS/TMP
;   symbols, so the loop count always agrees with the last-round test below and
;   with AES_ENC_BLOCKS. Callers that pass KEY_ROUNDS/TMP get identical code.
%macro AES_PARALLEL_ENC_BLOCKS 16
%define %%KEY_ROUNDS    %1
%define %%PARALLEL_DATA %2
%define %%EARLY_LOADS   %3
%define %%MOVDQ         %4
%define %%PXOR          %5
%define %%AES_DEC       %6
%define %%AES_DEC_LAST  %7
%define %%CACHED_KEYS   %8
%define %%TMP           %9
%define %%TMP_CNT       %10
%define %%FIRST_CKEY    %11
%define %%KEY_DATA      %12
%define %%FIRST_XDATA   %13
%define %%IN            %14     ; input data
%define %%OUT           %15     ; output data
%define %%IDX           %16     ; index into input and output data buffers

    %assign blocks_loaded 0

    %rep %%EARLY_LOADS
        SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX ; updates blocks_loaded
    %endrep ; %%EARLY_LOADS

    %assign current_tmp (%%TMP)
    INIT_SELECT_KEY %%KEY_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ

    %assign round 0
    %assign key 0
    %rep %%KEY_ROUNDS ; for all key rounds
        SELECT_KEY round, %%KEY_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ

        %assign i %%FIRST_XDATA
        %rep %%PARALLEL_DATA ; for each block do the AES block encode step
            %if (0 == round)
                %%PXOR XDATA(i), KEY_REG(key) ; first round's step
                ; interleave one of the remaining block loads with the round-0 XORs
                SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX

            %elif ( (%%KEY_ROUNDS-1) == round )
                %%AES_DEC_LAST XDATA(i), KEY_REG(key) ; last round's step

            %else
                %%AES_DEC XDATA(i), KEY_REG(key) ; middle round's (1..last-1) step

            %endif
            %assign i (i+1)
        %endrep ; %%PARALLEL_DATA
        %assign round (round+1)
    %endrep ; %%KEY_ROUNDS
%endmacro ; AES_PARALLEL_ENC_BLOCKS



;
; AES_ENC_BLOCKS
;   load first uncached key into TMP0 (if any)
;   AES block encrypt XDATA(ENC_BLOCK)
;   before using uncached key in TMP0, load next key in TMP1
;   before using uncached key in TMP1, load next key in TMP0
;
;   %1  TOT_ROUNDS    number of round keys
;   %2  ENC_BLOCK     index of the data register to process
;   %3  TMP           index of the first temp key register
;   %4  TMP_CNT       number of temp key registers
;   %5  FIRST_CKEY    index of the first cached key register
;   %6  CACHED_KEYS   number of round keys held in registers
;   %7  KEY_DATA      register holding the pointer to the expanded key data
;   %8  MOVDQ         load instruction
;   %9  PXOR          round-0 XOR instruction/macro
;   %10 AES_ENC       middle-round instruction/macro
;   %11 AES_ENC_LAST  last-round instruction/macro
;
;   Writes the preprocessor variables current_tmp, round, key.
%macro AES_ENC_BLOCKS 11
%define %%TOT_ROUNDS    %1
%define %%ENC_BLOCK     %2
%define %%TMP           %3
%define %%TMP_CNT       %4
%define %%FIRST_CKEY    %5
%define %%CACHED_KEYS   %6
%define %%KEY_DATA      %7
%define %%MOVDQ         %8
%define %%PXOR          %9
%define %%AES_ENC       %10
%define %%AES_ENC_LAST  %11

    %assign current_tmp (%%TMP)
    INIT_SELECT_KEY %%TOT_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ

    %assign round 0
    %assign key (round + %%FIRST_CKEY)
    %rep %%TOT_ROUNDS ; for all key rounds
        ; find the key data for this round
        SELECT_KEY round, %%TOT_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ

        ; encrypt block
        %if (0 == round)
            %%PXOR XDATA(%%ENC_BLOCK), KEY_REG(key) ; round zero step
        %elif ( (%%TOT_ROUNDS-1) == round )
            %%AES_ENC_LAST XDATA(%%ENC_BLOCK), KEY_REG(key) ; last round's step
        %else
            %%AES_ENC XDATA(%%ENC_BLOCK), KEY_REG(key) ; rounds 1..last-1 step
        %endif ; (0 == round)

        %assign round (round+1)
    %endrep ; %%TOT_ROUNDS
%endmacro ; AES_ENC_BLOCKS


diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm new file mode 100644 index 000000000..68aa227ca --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm @@ -0,0 +1,162 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; routine to do AES cbc decrypt on 16n bytes doing AES by 4 +; XMM registers are clobbered. 
; Saving/restoring must be done at a higher level
; (the win64 build below does preserve xmm6-xmm15 in FUNC_SAVE/FUNC_RESTORE)

; void aes_cbc_dec_128_sse(void     *in,
;                          uint8_t  *IV,
;                          uint8_t  *keys,
;                          void     *out,
;                          uint64_t  len_bytes);
;
; arg 1: IN:   pointer to input (cipher text)
; arg 2: IV:   pointer to IV
; arg 3: KEYS: pointer to keys
; arg 4: OUT:  pointer to output (plain text)
; arg 5: LEN:  length in bytes (multiple of 16)
;
%include "reg_sizes.asm"

; System V AMD64: all five arguments arrive in registers, nothing to save
%ifidn __OUTPUT_FORMAT__, elf64
%define IN      rdi
%define IV      rsi
%define KEYS    rdx
%define OUT     rcx
%define LEN     r8
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

; Microsoft x64: fifth argument is on the stack, xmm6-xmm15 are callee-saved
%ifidn __OUTPUT_FORMAT__, win64
%define IN      rcx
%define IV      rdx
%define KEYS    r8
%define OUT     r9
%define LEN     r10
%define PS      8
%define stack_size 10*16 + 1*8          ; must be an odd multiple of 8
%define arg(x)  [rsp + stack_size + PS + PS*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
        alloc_stack     stack_size
        save_xmm128     xmm6, 0*16
        save_xmm128     xmm7, 1*16
        save_xmm128     xmm8, 2*16
        save_xmm128     xmm9, 3*16
        save_xmm128     xmm10, 4*16
        save_xmm128     xmm11, 5*16
        save_xmm128     xmm12, 6*16
        save_xmm128     xmm13, 7*16
        save_xmm128     xmm14, 8*16
        save_xmm128     xmm15, 9*16
        end_prolog
        mov     LEN, arg(4)             ; fetch the fifth argument from the stack
%endmacro

%macro FUNC_RESTORE 0
        movdqa  xmm6, [rsp + 0*16]
        movdqa  xmm7, [rsp + 1*16]
        movdqa  xmm8, [rsp + 2*16]
        movdqa  xmm9, [rsp + 3*16]
        movdqa  xmm10, [rsp + 4*16]
        movdqa  xmm11, [rsp + 5*16]
        movdqa  xmm12, [rsp + 6*16]
        movdqa  xmm13, [rsp + 7*16]
        movdqa  xmm14, [rsp + 8*16]
        movdqa  xmm15, [rsp + 9*16]
        add     rsp, stack_size
%endmacro

%endif

; configuration parameters for AES-CBC macros
%define KEY_ROUNDS      11
%define XMM_USAGE       (16)
%define EARLY_BLOCKS    (2)
%define PARALLEL_BLOCKS (8)
%define IV_CNT          (1)

; instruction set specific operation definitions
%define MOVDQ           movdqu
%define PXOR            pxor
%define AES_DEC         aesdec
%define AES_DEC_LAST    aesdeclast
%include "cbc_common.asm"

section .text

;-----------------------------------------------------------------------
; aes_cbc_dec_128_sse
;   Decrypts LEN bytes from IN to OUT in CBC mode: full groups of
;   PARALLEL_BLOCKS blocks in main_loop, then the remainder in groups of
;   4, 2 or 1 blocks. Each CBC_DECRYPT_BLOCKS expansion advances IDX and
;   decrements LEN by the number of bytes it processed.
;   IDX, TMP, TMP_CNT, CKEY_CNT, FIRST_CKEY, FIRST_XDATA, IV_IDX and reg()
;   are not defined in this file (presumably provided by cbc_common.asm).
;   LEN is an unsigned byte count, so all length tests use unsigned
;   condition codes.
;-----------------------------------------------------------------------
align 16
mk_global aes_cbc_dec_128_sse, function
func(aes_cbc_dec_128_sse)
        endbranch
        FUNC_SAVE

        FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ

        MOVDQ   reg(IV_IDX), [IV]       ; Load IV for next round of block decrypt
        xor     IDX, IDX                ; offset into IN/OUT starts at 0
        cmp     LEN, PARALLEL_BLOCKS*16
        jb      partials                ; not enough data for one full parallel set

main_loop:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        cmp     LEN, PARALLEL_BLOCKS*16
        jae     main_loop               ; enough blocks to do another full parallel set

partials:       ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
        test    LEN, LEN
        jz      done
        cmp     LEN, 4*16
        jae     initial_4
        cmp     LEN, 2*16
        jae     initial_2

initial_1:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jmp     done

initial_2:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jz      done                    ; ZF comes from the macro's final 'sub LEN, n*16'
        jmp     partials

initial_4:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jnz     partials                ; ZF comes from the macro's final 'sub LEN, n*16'

done:
        FUNC_RESTORE
        ret

endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm new file mode 100644 index 000000000..d4b6dfb2a --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm @@ -0,0 +1,162 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; routine to do AES128 CBC decrypt
;; clobbers xmm0-15
;; (the win64 build below preserves xmm6-xmm15 in FUNC_SAVE/FUNC_RESTORE)


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

; System V AMD64: all five arguments arrive in registers, nothing to save
%ifidn __OUTPUT_FORMAT__, elf64
%define IN      rdi
%define IV      rsi
%define KEYS    rdx
%define OUT     rcx
%define LEN     r8
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

; Microsoft x64: fifth argument is on the stack, xmm6-xmm15 are callee-saved
%ifidn __OUTPUT_FORMAT__, win64
%define IN      rcx
%define IV      rdx
%define KEYS    r8
%define OUT     r9
%define LEN     r10
%define PS      8
%define stack_size 10*16 + 1*8          ; must be an odd multiple of 8
%define arg(x)  [rsp + stack_size + PS + PS*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
        alloc_stack     stack_size
        save_xmm128     xmm6, 0*16
        save_xmm128     xmm7, 1*16
        save_xmm128     xmm8, 2*16
        save_xmm128     xmm9, 3*16
        save_xmm128     xmm10, 4*16
        save_xmm128     xmm11, 5*16
        save_xmm128     xmm12, 6*16
        save_xmm128     xmm13, 7*16
        save_xmm128     xmm14, 8*16
        save_xmm128     xmm15, 9*16
        end_prolog
        mov     LEN, arg(4)             ; fetch the fifth argument from the stack
%endmacro

%macro FUNC_RESTORE 0
        movdqa  xmm6, [rsp + 0*16]
        movdqa  xmm7, [rsp + 1*16]
        movdqa  xmm8, [rsp + 2*16]
        movdqa  xmm9, [rsp + 3*16]
        movdqa  xmm10, [rsp + 4*16]
        movdqa  xmm11, [rsp + 5*16]
        movdqa  xmm12, [rsp + 6*16]
        movdqa  xmm13, [rsp + 7*16]
        movdqa  xmm14, [rsp + 8*16]
        movdqa  xmm15, [rsp + 9*16]
        add     rsp, stack_size
%endmacro

%endif

; configuration parameters for AES-CBC
%define KEY_ROUNDS      11
%define XMM_USAGE       (16)
%define EARLY_BLOCKS    (4)
%define PARALLEL_BLOCKS (11)
%define IV_CNT          (1)

; instruction set specific operation definitions
; (two-operand wrappers around the three-operand VEX forms: dst = dst op src)
%define MOVDQ           vmovdqu
%macro PXOR 2
        vpxor   %1, %1, %2
%endm

%macro AES_DEC 2
        vaesdec %1, %1, %2
%endm

%macro AES_DEC_LAST 2
        vaesdeclast %1, %1, %2
%endm

%include "cbc_common.asm"

section .text

;-----------------------------------------------------------------------
;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
;   Decrypts LEN bytes from IN to OUT in CBC mode: full groups of
;   PARALLEL_BLOCKS blocks in main_loop, then the remainder in groups of
;   4, 2 or 1 blocks. Each CBC_DECRYPT_BLOCKS expansion advances IDX and
;   decrements LEN by the number of bytes it processed.
;   IDX, TMP, TMP_CNT, CKEY_CNT, FIRST_CKEY, FIRST_XDATA, IV_IDX and reg()
;   are not defined in this file (presumably provided by cbc_common.asm).
;   LEN is an unsigned byte count, so all length tests use unsigned
;   condition codes.
;-----------------------------------------------------------------------
mk_global aes_cbc_dec_128_avx, function
func(aes_cbc_dec_128_avx)
        endbranch
        FUNC_SAVE

        FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ

        MOVDQ   reg(IV_IDX), [IV]       ; Load IV for next round of block decrypt
        xor     IDX, IDX                ; offset into IN/OUT starts at 0
        cmp     LEN, PARALLEL_BLOCKS*16
        jb      partials                ; not enough data for one full parallel set

main_loop:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        cmp     LEN, PARALLEL_BLOCKS*16
        jae     main_loop               ; enough blocks to do another full parallel set

partials:       ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
        test    LEN, LEN
        jz      done
        cmp     LEN, 4*16
        jae     initial_4
        cmp     LEN, 2*16
        jae     initial_2

initial_1:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jmp     done

initial_2:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jz      done                    ; ZF comes from the macro's final 'sub LEN, n*16'
        jmp     partials

initial_4:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jnz     partials                ; ZF comes from the macro's final 'sub LEN, n*16'
done:
        FUNC_RESTORE
        ret

endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm new file mode 100644 index 000000000..4b017d193 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm @@ -0,0 +1,164 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; routine to do AES cbc decrypt on 16n bytes doing AES +; XMM registers are clobbered. 
; Saving/restoring must be done at a higher level
; (the win64 build below does preserve xmm6-xmm15 in FUNC_SAVE/FUNC_RESTORE)

; void aes_cbc_dec_192_sse(void     *in,
;                          uint8_t  *IV,
;                          uint8_t   keys[13], // +1 over key length
;                          void     *out,
;                          uint64_t  len_bytes);
;
; arg 1: IN:   pointer to input (cipher text)
; arg 2: IV:   pointer to IV
; arg 3: KEYS: pointer to keys
; arg 4: OUT:  pointer to output (plain text)
; arg 5: LEN:  length in bytes (multiple of 16)
;

%include "reg_sizes.asm"

; System V AMD64: all five arguments arrive in registers, nothing to save
%ifidn __OUTPUT_FORMAT__, elf64
%define IN      rdi
%define IV      rsi
%define KEYS    rdx
%define OUT     rcx
%define LEN     r8
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

; Microsoft x64: fifth argument is on the stack, xmm6-xmm15 are callee-saved
%ifidn __OUTPUT_FORMAT__, win64
%define IN      rcx
%define IV      rdx
%define KEYS    r8
%define OUT     r9
%define LEN     r10
%define PS      8
%define stack_size 10*16 + 1*8          ; must be an odd multiple of 8
%define arg(x)  [rsp + stack_size + PS + PS*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
        alloc_stack     stack_size
        save_xmm128     xmm6, 0*16
        save_xmm128     xmm7, 1*16
        save_xmm128     xmm8, 2*16
        save_xmm128     xmm9, 3*16
        save_xmm128     xmm10, 4*16
        save_xmm128     xmm11, 5*16
        save_xmm128     xmm12, 6*16
        save_xmm128     xmm13, 7*16
        save_xmm128     xmm14, 8*16
        save_xmm128     xmm15, 9*16
        end_prolog
        mov     LEN, arg(4)             ; fetch the fifth argument from the stack
%endmacro

%macro FUNC_RESTORE 0
        movdqa  xmm6, [rsp + 0*16]
        movdqa  xmm7, [rsp + 1*16]
        movdqa  xmm8, [rsp + 2*16]
        movdqa  xmm9, [rsp + 3*16]
        movdqa  xmm10, [rsp + 4*16]
        movdqa  xmm11, [rsp + 5*16]
        movdqa  xmm12, [rsp + 6*16]
        movdqa  xmm13, [rsp + 7*16]
        movdqa  xmm14, [rsp + 8*16]
        movdqa  xmm15, [rsp + 9*16]
        add     rsp, stack_size
%endmacro

%endif

; configuration parameters for AES-CBC
%define KEY_ROUNDS      13
%define XMM_USAGE       (16)
%define EARLY_BLOCKS    (2)
%define PARALLEL_BLOCKS (5)
%define IV_CNT          (1)

; instruction set specific operation definitions
; (MOVDQ is defined once, here; nothing above this point references it)
%define MOVDQ           movdqu
%define PXOR            pxor
%define AES_DEC         aesdec
%define AES_DEC_LAST    aesdeclast

%include "cbc_common.asm"

section .text

;-----------------------------------------------------------------------
; aes_cbc_dec_192_sse
;   Decrypts LEN bytes from IN to OUT in CBC mode: full groups of
;   PARALLEL_BLOCKS blocks in main_loop, then the remainder in groups of
;   4, 2 or 1 blocks. Each CBC_DECRYPT_BLOCKS expansion advances IDX and
;   decrements LEN by the number of bytes it processed.
;   IDX, TMP, TMP_CNT, CKEY_CNT, FIRST_CKEY, FIRST_XDATA, IV_IDX and reg()
;   are not defined in this file (presumably provided by cbc_common.asm).
;   LEN is an unsigned byte count, so all length tests use unsigned
;   condition codes.
;-----------------------------------------------------------------------
mk_global aes_cbc_dec_192_sse, function
func(aes_cbc_dec_192_sse)
        endbranch
        FUNC_SAVE

        FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ

        MOVDQ   reg(IV_IDX), [IV]       ; Load IV for next round of block decrypt
        xor     IDX, IDX                ; offset into IN/OUT starts at 0
        cmp     LEN, PARALLEL_BLOCKS*16
        jb      partials                ; not enough data for one full parallel set

main_loop:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        cmp     LEN, PARALLEL_BLOCKS*16
        jae     main_loop               ; enough blocks to do another full parallel set

partials:       ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
        test    LEN, LEN
        jz      done
        cmp     LEN, 4*16
        jae     initial_4
        cmp     LEN, 2*16
        jae     initial_2

initial_1:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jmp     done

initial_2:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jz      done                    ; ZF comes from the macro's final 'sub LEN, n*16'
        jmp     partials

initial_4:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jnz     partials                ; ZF comes from the macro's final 'sub LEN, n*16'
done:
        FUNC_RESTORE
        ret

endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm new file mode 100644 index 000000000..2791570ad --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm @@ -0,0 +1,158 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; routine to do AES192 CBC decrypt

%include "reg_sizes.asm"

; System V AMD64: all five arguments arrive in registers, nothing to save
%ifidn __OUTPUT_FORMAT__, elf64
%define IN      rdi
%define IV      rsi
%define KEYS    rdx
%define OUT     rcx
%define LEN     r8
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

; Microsoft x64: fifth argument is on the stack, xmm6-xmm15 are callee-saved
%ifidn __OUTPUT_FORMAT__, win64
%define IN      rcx
%define IV      rdx
%define KEYS    r8
%define OUT     r9
%define LEN     r10
%define PS      8
%define stack_size 10*16 + 1*8          ; must be an odd multiple of 8
%define arg(x)  [rsp + stack_size + PS + PS*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
        alloc_stack     stack_size
        save_xmm128     xmm6, 0*16
        save_xmm128     xmm7, 1*16
        save_xmm128     xmm8, 2*16
        save_xmm128     xmm9, 3*16
        save_xmm128     xmm10, 4*16
        save_xmm128     xmm11, 5*16
        save_xmm128     xmm12, 6*16
        save_xmm128     xmm13, 7*16
        save_xmm128     xmm14, 8*16
        save_xmm128     xmm15, 9*16
        end_prolog
        mov     LEN, arg(4)             ; fetch the fifth argument from the stack
%endmacro

%macro FUNC_RESTORE 0
        movdqa  xmm6, [rsp + 0*16]
        movdqa  xmm7, [rsp + 1*16]
        movdqa  xmm8, [rsp + 2*16]
        movdqa  xmm9, [rsp + 3*16]
        movdqa  xmm10, [rsp + 4*16]
        movdqa  xmm11, [rsp + 5*16]
        movdqa  xmm12, [rsp + 6*16]
        movdqa  xmm13, [rsp + 7*16]
        movdqa  xmm14, [rsp + 8*16]
        movdqa  xmm15, [rsp + 9*16]
        add     rsp, stack_size
%endmacro
%endif

; configuration parameters for AES-CBC
%define KEY_ROUNDS      13
%define XMM_USAGE       (16)
%define EARLY_BLOCKS    (4)
%define PARALLEL_BLOCKS (11)
%define IV_CNT          (1)

; instruction set specific operation definitions
; (two-operand wrappers around the three-operand VEX forms: dst = dst op src)
%define MOVDQ           vmovdqu
%macro PXOR 2
        vpxor   %1, %1, %2
%endm

%macro AES_DEC 2
        vaesdec %1, %1, %2
%endm

%macro AES_DEC_LAST 2
        vaesdeclast %1, %1, %2
%endm

%include "cbc_common.asm"

; state the code section explicitly, as the 128-bit and 192-bit SSE variants do
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
;   Decrypts LEN bytes from IN to OUT in CBC mode: full groups of
;   PARALLEL_BLOCKS blocks in main_loop, then the remainder in groups of
;   4, 2 or 1 blocks. Each CBC_DECRYPT_BLOCKS expansion advances IDX and
;   decrements LEN by the number of bytes it processed.
;   IDX, TMP, TMP_CNT, CKEY_CNT, FIRST_CKEY, FIRST_XDATA, IV_IDX and reg()
;   are not defined in this file (presumably provided by cbc_common.asm).
;   LEN is an unsigned byte count, so all length tests use unsigned
;   condition codes.
mk_global aes_cbc_dec_192_avx, function
func(aes_cbc_dec_192_avx)
        endbranch
        FUNC_SAVE

        FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ

        MOVDQ   reg(IV_IDX), [IV]       ; Load IV for next round of block decrypt
        xor     IDX, IDX                ; offset into IN/OUT starts at 0
        cmp     LEN, PARALLEL_BLOCKS*16
        jb      partials                ; not enough data for one full parallel set

main_loop:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        cmp     LEN, PARALLEL_BLOCKS*16
        jae     main_loop               ; enough blocks to do another full parallel set

partials:       ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
        test    LEN, LEN
        jz      done
        cmp     LEN, 4*16
        jae     initial_4
        cmp     LEN, 2*16
        jae     initial_2

initial_1:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jmp     done

initial_2:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jz      done                    ; ZF comes from the macro's final 'sub LEN, n*16'
        jmp     partials

initial_4:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jnz     partials                ; ZF comes from the macro's final 'sub LEN, n*16'
done:
        FUNC_RESTORE
        ret

endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm new file mode 100644 index 000000000..44c76268e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm @@ -0,0 +1,161 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; routine to do AES cbc decrypt on 16n bytes doing AES +; XMM registers are clobbered. 
; Saving/restoring must be done at a higher level
; (the win64 build below does preserve xmm6-xmm15 in FUNC_SAVE/FUNC_RESTORE)

; void aes_cbc_dec_256_sse(void     *in,
;                          uint8_t  *IV,
;                          uint8_t  *keys,
;                          void     *out,
;                          uint64_t  len_bytes);
;
; arg 1: IN   (win64 rcx, elf64 rdi): pointer to input (cipher text)
; arg 2: IV   (win64 rdx, elf64 rsi): pointer to IV
; arg 3: KEYS (win64 r8,  elf64 rdx): pointer to keys
; arg 4: OUT  (win64 r9,  elf64 rcx): pointer to output (plain text)
; arg 5: LEN  (win64 stack -> r10, elf64 r8): length in bytes (multiple of 16)
;

%include "reg_sizes.asm"

; System V AMD64: all five arguments arrive in registers, nothing to save
%ifidn __OUTPUT_FORMAT__, elf64
%define IN      rdi
%define IV      rsi
%define KEYS    rdx
%define OUT     rcx
%define LEN     r8
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

; Microsoft x64: fifth argument is on the stack, xmm6-xmm15 are callee-saved
%ifidn __OUTPUT_FORMAT__, win64
%define IN      rcx
%define IV      rdx
%define KEYS    r8
%define OUT     r9
%define LEN     r10
%define PS      8
%define stack_size 10*16 + 1*8          ; must be an odd multiple of 8
%define arg(x)  [rsp + stack_size + PS + PS*x]

%define func(x) proc_frame x
%macro FUNC_SAVE 0
        alloc_stack     stack_size
        save_xmm128     xmm6, 0*16
        save_xmm128     xmm7, 1*16
        save_xmm128     xmm8, 2*16
        save_xmm128     xmm9, 3*16
        save_xmm128     xmm10, 4*16
        save_xmm128     xmm11, 5*16
        save_xmm128     xmm12, 6*16
        save_xmm128     xmm13, 7*16
        save_xmm128     xmm14, 8*16
        save_xmm128     xmm15, 9*16
        end_prolog
        mov     LEN, arg(4)             ; fetch the fifth argument from the stack
%endmacro

%macro FUNC_RESTORE 0
        movdqa  xmm6, [rsp + 0*16]
        movdqa  xmm7, [rsp + 1*16]
        movdqa  xmm8, [rsp + 2*16]
        movdqa  xmm9, [rsp + 3*16]
        movdqa  xmm10, [rsp + 4*16]
        movdqa  xmm11, [rsp + 5*16]
        movdqa  xmm12, [rsp + 6*16]
        movdqa  xmm13, [rsp + 7*16]
        movdqa  xmm14, [rsp + 8*16]
        movdqa  xmm15, [rsp + 9*16]
        add     rsp, stack_size
%endmacro
%endif

; configuration parameters for AES-CBC
%define KEY_ROUNDS      15
%define XMM_USAGE       (16)
%define EARLY_BLOCKS    (4)
%define PARALLEL_BLOCKS (11)
%define IV_CNT          (1)

; instruction set specific operation definitions
; (MOVDQ is defined once, here; nothing above this point references it)
%define MOVDQ           movdqu
%define PXOR            pxor
%define AES_DEC         aesdec
%define AES_DEC_LAST    aesdeclast

%include "cbc_common.asm"

; state the code section explicitly, as the 128-bit and 192-bit SSE variants do
section .text

;-----------------------------------------------------------------------
; aes_cbc_dec_256_sse
;   Decrypts LEN bytes from IN to OUT in CBC mode: full groups of
;   PARALLEL_BLOCKS blocks in main_loop, then the remainder in groups of
;   4, 2 or 1 blocks. Each CBC_DECRYPT_BLOCKS expansion advances IDX and
;   decrements LEN by the number of bytes it processed.
;   IDX, TMP, TMP_CNT, CKEY_CNT, FIRST_CKEY, FIRST_XDATA, IV_IDX and reg()
;   are not defined in this file (presumably provided by cbc_common.asm).
;   LEN is an unsigned byte count, so all length tests use unsigned
;   condition codes.
;-----------------------------------------------------------------------
mk_global aes_cbc_dec_256_sse, function
func(aes_cbc_dec_256_sse)
        endbranch
        FUNC_SAVE

        FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ

        MOVDQ   reg(IV_IDX), [IV]       ; Load IV for next round of block decrypt
        xor     IDX, IDX                ; offset into IN/OUT starts at 0
        cmp     LEN, PARALLEL_BLOCKS*16
        jb      partials                ; not enough data for one full parallel set

main_loop:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        cmp     LEN, PARALLEL_BLOCKS*16
        jae     main_loop               ; enough blocks to do another full parallel set

partials:       ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
        test    LEN, LEN
        jz      done
        cmp     LEN, 4*16
        jae     initial_4
        cmp     LEN, 2*16
        jae     initial_2

initial_1:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jmp     done

initial_2:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jz      done                    ; ZF comes from the macro's final 'sub LEN, n*16'
        jmp     partials

initial_4:
        CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
        jnz     partials                ; ZF comes from the macro's final 'sub LEN, n*16'
done:
        FUNC_RESTORE
        ret

endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm new file mode 100644 index 000000000..cad1a6bef --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm @@ -0,0 +1,158 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES256 CBC decrypt
+; Arguments (see the %defines below):
+;   IN   (elf64 rdi / win64 rcx)  KEYS (elf64 rdx / win64 r8)
+;   IV   (elf64 rsi / win64 rdx)  OUT  (elf64 rcx / win64 r9)
+;   LEN  (elf64 r8  / win64 stack slot, loaded into r10)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 (rsp is 8 mod 16 on entry; this realigns it for movdqa)
+%define arg(x) [rsp + stack_size + PS + PS*x]	; x is zero based: skips the save area and the return address
+
+%define func(x) proc_frame x
+; Win64 prologue: spill the callee-saved xmm6-xmm15 and fetch the 5th argument.
+%macro FUNC_SAVE 0
+	alloc_stack stack_size
+	save_xmm128 xmm6, 0*16
+	save_xmm128 xmm7, 1*16
+	save_xmm128 xmm8, 2*16
+	save_xmm128 xmm9, 3*16
+	save_xmm128 xmm10, 4*16
+	save_xmm128 xmm11, 5*16
+	save_xmm128 xmm12, 6*16
+	save_xmm128 xmm13, 7*16
+	save_xmm128 xmm14, 8*16
+	save_xmm128 xmm15, 9*16
+	end_prolog
+	mov LEN, arg(4)	; num_bytes is the 5th argument, passed on the stack
+%endmacro
+
+; Win64 epilogue: reload xmm6-xmm15 and release the save area.
+%macro FUNC_RESTORE 0
+	movdqa xmm6, [rsp + 0*16]
+	movdqa xmm7, [rsp + 1*16]
+	movdqa xmm8, [rsp + 2*16]
+	movdqa xmm9, [rsp + 3*16]
+	movdqa xmm10, [rsp + 4*16]
+	movdqa xmm11, [rsp + 5*16]
+	movdqa xmm12, [rsp + 6*16]
+	movdqa xmm13, [rsp + 7*16]
+	movdqa xmm14, [rsp + 8*16]
+	movdqa xmm15, [rsp + 9*16]
+	add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+; (XMM_USAGE, EARLY_BLOCKS and IV_CNT are presumably consumed by cbc_common.asm,
+;  which is not visible here -- confirm there)
+%define KEY_ROUNDS 15	; AES-256 uses 15 round keys (14 rounds + initial key)
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)	; blocks handled per main_loop iteration
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+; (two-operand wrappers around the three-operand VEX forms, destination = first source)
+%define MOVDQ vmovdqu
+%macro PXOR 2
+	vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+	vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+	vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;; Decrypts num_bytes in groups of PARALLEL_BLOCKS, then 4, 2 and 1 blocks.
+mk_global aes_cbc_dec_256_avx, function
+func(aes_cbc_dec_256_avx)
+	endbranch
+	FUNC_SAVE
+
+	FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+	MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+	mov IDX, 0	; IDX comes from cbc_common.asm; presumably the running byte offset -- confirm
+	cmp LEN, PARALLEL_BLOCKS*16
+	jge main_loop ; if enough data blocks remain enter main_loop
+	jmp partials
+
+main_loop:
+	CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+	cmp LEN, PARALLEL_BLOCKS*16
+	jge main_loop ; enough blocks to do another full parallel set
+	jz done	; never taken: ZF from the cmp above implies equality, which jge already handled
+
+partials: ; fewer than 'PARALLEL_BLOCKS' left do in groups of 4, 2 or 1
+	cmp LEN, 0
+	je done
+	cmp LEN, 4*16
+	jge initial_4
+	cmp LEN, 2*16
+	jge initial_2
+
+initial_1:
+	CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+	jmp done
+
+initial_2:
+	CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+	jz done	; uses flags left by CBC_DECRYPT_BLOCKS (presumably its final LEN update -- confirm in cbc_common.asm)
+	jmp partials
+
+initial_4:
+	CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+	jnz partials	; same flag dependency as above; falls through to done when ZF is set
+done:
+	FUNC_RESTORE
+	ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm
new file mode 100644
index 000000000..6124e2def
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm
@@ -0,0 +1,519 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2019-2021 Intel Corporation All rights reserved.
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "aes_common.asm"
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+[bits 64]
+default rel
+
+;; ZMM register roles. Each ZMM holds four 16-byte AES blocks.
+;; NOTE(review): zTMP1-zTMP3 alias xmm6-xmm8, which are callee-saved in the
+;; Microsoft x64 ABI; any win64 entry point using them has to preserve them.
+%define zIV zmm0
+%define zBLK_0_3 zmm1
+%define zBLK_4_7 zmm2
+%define zBLK_8_11 zmm3
+%define zBLK_12_15 zmm4
+%define zTMP0 zmm5
+%define zTMP1 zmm6
+%define zTMP2 zmm7
+%define zTMP3 zmm8
+
+;; Round keys, one per ZMM (filled by LOAD_KEYS)
+%define ZKEY0 zmm17
+%define ZKEY1 zmm18
+%define ZKEY2 zmm19
+%define ZKEY3 zmm20
+%define ZKEY4 zmm21
+%define ZKEY5 zmm22
+%define ZKEY6 zmm23
+%define ZKEY7 zmm24
+%define ZKEY8 zmm25
+%define ZKEY9 zmm26
+%define ZKEY10 zmm27
+%define ZKEY11 zmm28
+%define ZKEY12 zmm29
+%define ZKEY13 zmm30
+%define ZKEY14 zmm31
+
+;; Argument registers: System V for elf64, otherwise the Microsoft x64 layout
+;; (the 5th argument is on the stack there and gets loaded into rax).
+%ifidn __OUTPUT_FORMAT__, elf64
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax
+%endif
+
+%define tmp r10
+%define tmp2 r11
+
+;; OFFSET: per-block byte stride; DECRYPT_16_PARALLEL advances the buffers by 16 * OFFSET
+%ifdef CBCS
+%define OFFSET 160
+%else
+%define OFFSET 16
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; macro to preload keys
+;;; - uses ZKEY[0-14] registers (ZMM)
+;;; - loads (%%NROUNDS + 2) round keys, each broadcast to all four 128-bit lanes
+%macro LOAD_KEYS 2
+%define %%KEYS %1 ; [in] key pointer
+%define %%NROUNDS %2 ; [in] numerical value, number of AES rounds
+	; excluding 1st and last rounds.
+	; Example: AES-128 -> value 9
+
+%assign i 0
+%rep (%%NROUNDS + 2)
+	vbroadcastf64x2 ZKEY %+ i, [%%KEYS + 16*i]
+%assign i (i + 1)
+%endrep
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "cool down" pipeline after DECRYPT_16_PARALLEL macro
+;;; code as the number of final blocks is variable.
+;;; Processes the last %%num_final_blocks blocks (1 to 15, can't be 0)
+;;; - %%num_final_blocks is an assemble-time constant, so only the code needed
+;;;   for that block count is emitted
+;;; - %%IA0 is accepted but not referenced in the macro body
+
+%macro FINAL_BLOCKS 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LAST_CIPH_BLK %3 ; [in/out] ZMM with IV/last cipher blk (in idx 3)
+%define %%num_final_blocks %4 ; [in] numerical value (1 - 15)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%IA0 %13 ; [clobbered] GP temporary
+%define %%NROUNDS %14 ; [in] number of rounds; numerical value
+
+	;; load plain/cipher text
+%ifdef CBCS
+	ZMM_LOAD_BLOCKS_0_16_OFFSET %%num_final_blocks, %%CIPH_IN, \
+		OFFSET, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+		%%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+	ZMM_LOAD_BLOCKS_0_16 %%num_final_blocks, %%CIPH_IN, 0, \
+		%%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+		%%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%endif
+	;; Prepare final cipher text blocks to
+	;; be XOR'd later after AESDEC
+	;; (valignq ..., 6 shifts the pair right by three blocks: the result is
+	;;  { block 3 of the 2nd source, blocks 0-2 of the 1st source }, i.e. the
+	;;  "previous cipher block" for each of the four lanes)
+	valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+%if %%num_final_blocks > 4
+	valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+%endif
+%if %%num_final_blocks > 8
+	valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+%endif
+%if %%num_final_blocks > 12
+	valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+%endif
+
+	;; Update IV with last cipher block
+	;; to be used later in DECRYPT_16_PARALLEL
+	;; (rotates the register holding the last loaded block so that block
+	;;  ends up in idx 3, where the valignq above expects it)
+%if %%num_final_blocks == 1
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 2
+%elif %%num_final_blocks == 2
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 4
+%elif %%num_final_blocks == 3
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 6
+%elif %%num_final_blocks == 4
+	vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3
+%elif %%num_final_blocks == 5
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 2
+%elif %%num_final_blocks == 6
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 4
+%elif %%num_final_blocks == 7
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 6
+%elif %%num_final_blocks == 8
+	vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7
+%elif %%num_final_blocks == 9
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 2
+%elif %%num_final_blocks == 10
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 4
+%elif %%num_final_blocks == 11
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 6
+%elif %%num_final_blocks == 12
+	vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11
+%elif %%num_final_blocks == 13
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 2
+%elif %%num_final_blocks == 14
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 4
+%elif %%num_final_blocks == 15
+	valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 6
+%endif
+
+	;; AES rounds
+	;; (one ZMM_AESDEC_ROUND_BLOCKS_0_16 call per preloaded round key ZKEY0..ZKEY(NROUNDS+1))
+%assign j 0
+%rep (%%NROUNDS + 2)
+	ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+		%%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+		ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+		%%num_final_blocks, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+	;; XOR with decrypted blocks to get plain text
+	vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+%if %%num_final_blocks > 4
+	vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+%endif
+%if %%num_final_blocks > 8
+	vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+%endif
+%if %%num_final_blocks > 12
+	vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+%endif
+
+	;; write plain text back to output
+%ifdef CBCS
+	ZMM_STORE_BLOCKS_0_16_OFFSET %%num_final_blocks, %%PLAIN_OUT, \
+		OFFSET, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+		%%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+	ZMM_STORE_BLOCKS_0_16 %%num_final_blocks, %%PLAIN_OUT, 0, \
+		%%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+		%%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%endif
+
+%endmacro ; FINAL_BLOCKS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main AES-CBC decrypt macro
+;;; - operates on single stream
+;;; - decrypts 16 blocks at a time
+;;; - on exit %%LENGTH is reduced by 256 and both buffer pointers are
+;;;   advanced by 16 * OFFSET
+;;; - %%IA0 is accepted but not referenced in the macro body
+%macro DECRYPT_16_PARALLEL 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LENGTH %3 ; [in/out] number of bytes to process
+%define %%LAST_CIPH_BLK %4 ; [in/out] ZMM with IV (first block) or last cipher block (idx 3)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%NROUNDS %13 ; [in] number of rounds; numerical value
+%define %%IA0 %14 ; [clobbered] GP temporary
+
+%ifdef CBCS
+	ZMM_LOAD_BLOCKS_0_16_OFFSET 16, %%CIPH_IN, OFFSET, \
+		%%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+		%%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+	vmovdqu8 %%CIPHER_PLAIN_0_3, [%%CIPH_IN]
+	vmovdqu8 %%CIPHER_PLAIN_4_7, [%%CIPH_IN + 64]
+	vmovdqu8 %%CIPHER_PLAIN_8_11, [%%CIPH_IN + 128]
+	vmovdqu8 %%CIPHER_PLAIN_12_15, [%%CIPH_IN + 192]
+%endif
+	;; prepare first set of cipher blocks for later XOR'ing
+	;; (each ZTn = { last block of the previous group, blocks 0-2 of this group })
+	valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+	valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+	valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+	valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+
+	;; store last cipher text block to be used for next 16 blocks
+	;; (whole register is copied; block 15 sits in idx 3)
+	vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15
+
+	;; AES rounds
+%assign j 0
+%rep (%%NROUNDS + 2)
+	ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+		%%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+		ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+		16, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+	;; XOR with decrypted blocks to get plain text
+	vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+	vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+	vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+	vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+
+	;; write plain text back to output
+%ifdef CBCS
+	ZMM_STORE_BLOCKS_0_16_OFFSET 16, %%PLAIN_OUT, OFFSET, \
+		%%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+		%%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+	vmovdqu8 [%%PLAIN_OUT], %%CIPHER_PLAIN_0_3
+	vmovdqu8 [%%PLAIN_OUT + 64], %%CIPHER_PLAIN_4_7
+	vmovdqu8 [%%PLAIN_OUT + 128], %%CIPHER_PLAIN_8_11
+	vmovdqu8 [%%PLAIN_OUT + 192], %%CIPHER_PLAIN_12_15
+%endif
+	;; adjust input pointer and length
+	sub %%LENGTH, (16 * 16)
+	add %%CIPH_IN, (16 * OFFSET)
+	add %%PLAIN_OUT, (16 * OFFSET)
+
+%endmacro ; DECRYPT_16_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; AES_CBC_DEC macro decrypts given data.
+;;; Flow:
+;;; - Decrypt all blocks (multiple of 16) up to final 1-15 blocks
+;;; - Decrypt final blocks (1-15 blocks)
+;;; Clobbers: zIV, zBLK_*, zTMP0-zTMP3, ZKEY0..ZKEY(NROUNDS+1), flags.
+;;; %%LENGTH is destroyed (it ends up holding the final block count) and the
+;;; buffer pointers are advanced past every full group of 16 blocks.
+;;; Every path leaves through %%cbc_dec_done at the very end of the macro, so
+;;; code placed right after the invocation always runs.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro AES_CBC_DEC 7
+%define %%CIPH_IN %1 ;; [in] pointer to input buffer
+%define %%PLAIN_OUT %2 ;; [in] pointer to output buffer
+%define %%KEYS %3 ;; [in] pointer to expanded keys
+%define %%IV %4 ;; [in] pointer to IV
+%define %%LENGTH %5 ;; [in/out] GP register with length in bytes
+%define %%NROUNDS %6 ;; [in] Number of AES rounds; numerical value
+%define %%TMP %7 ;; [clobbered] GP register
+
+	cmp %%LENGTH, 0
+	je %%cbc_dec_done
+
+	;; IV goes into block idx 3, the lane the valignq shuffles read from
+	vinserti64x2 zIV, zIV, [%%IV], 3
+
+	;; preload keys
+	LOAD_KEYS %%KEYS, %%NROUNDS
+
+%%decrypt_16_parallel:
+	cmp %%LENGTH, 256
+	jb %%final_blocks
+
+	DECRYPT_16_PARALLEL %%PLAIN_OUT, %%CIPH_IN, %%LENGTH, zIV, \
+		zBLK_0_3, zBLK_4_7, zBLK_8_11, zBLK_12_15, \
+		zTMP0, zTMP1, zTMP2, zTMP3, %%NROUNDS, %%TMP
+	jmp %%decrypt_16_parallel
+
+%%final_blocks:
+	;; get num final blocks
+	;; (bytes -> blocks, keep 0..15; the signed compares below are safe in that range)
+	shr %%LENGTH, 4
+	and %%LENGTH, 0xf
+	je %%cbc_dec_done
+
+	cmp %%LENGTH, 8
+	je %%final_num_blocks_is_8
+	jl %%final_blocks_is_1_7
+
+	; Final blocks 9-15
+	cmp %%LENGTH, 12
+	je %%final_num_blocks_is_12
+	jl %%final_blocks_is_9_11
+
+	; Final blocks 13-15
+	cmp %%LENGTH, 15
+	je %%final_num_blocks_is_15
+	cmp %%LENGTH, 14
+	je %%final_num_blocks_is_14
+	cmp %%LENGTH, 13
+	je %%final_num_blocks_is_13
+
+%%final_blocks_is_9_11:
+	cmp %%LENGTH, 11
+	je %%final_num_blocks_is_11
+	cmp %%LENGTH, 10
+	je %%final_num_blocks_is_10
+	cmp %%LENGTH, 9
+	je %%final_num_blocks_is_9
+
+%%final_blocks_is_1_7:
+	cmp %%LENGTH, 4
+	je %%final_num_blocks_is_4
+	jl %%final_blocks_is_1_3
+
+	; Final blocks 5-7
+	cmp %%LENGTH, 7
+	je %%final_num_blocks_is_7
+	cmp %%LENGTH, 6
+	je %%final_num_blocks_is_6
+	cmp %%LENGTH, 5
+	je %%final_num_blocks_is_5
+
+%%final_blocks_is_1_3:
+	cmp %%LENGTH, 3
+	je %%final_num_blocks_is_3
+	cmp %%LENGTH, 2
+	je %%final_num_blocks_is_2
+	jmp %%final_num_blocks_is_1
+
+
+%%final_num_blocks_is_15:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 15, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_14:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 14, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_13:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 13, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_12:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 12, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_11:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 11, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_10:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 10, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_9:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 9, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_8:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 8, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_7:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 7, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_6:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 6, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_5:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 5, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_4:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 4, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_3:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 3, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_2:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 2, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+	jmp %%cbc_dec_done
+
+%%final_num_blocks_is_1:
+	FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 1, zBLK_0_3, zBLK_4_7, \
+		zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+		%%TMP, %%NROUNDS
+
+%%cbc_dec_done:
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+
+%ifndef CBCS
+
+;; Entry/exit glue for the three entry points below.
+;; win64: the macros above write zmm6-zmm8 (zTMP1-zTMP3); their low 128 bits,
+;; xmm6-xmm8, are callee-saved in the Microsoft x64 ABI, so they are spilled on
+;; entry and reloaded before returning. The 5th argument is read from its stack
+;; slot before rsp is moved. Other targets need neither step.
+%ifidn __OUTPUT_FORMAT__, win64
+%define VAES_CBC_XMM_SAVE_SIZE (3*16 + 8) ; 3 XMM slots; odd multiple of 8 keeps rsp 16-byte aligned
+%macro VAES_CBC_FUNC_SAVE 0
+	mov num_bytes, [rsp + 8*5]	; return address + 32-byte shadow space precede the 5th argument
+	sub rsp, VAES_CBC_XMM_SAVE_SIZE
+	vmovdqu [rsp + 0*16], xmm6
+	vmovdqu [rsp + 1*16], xmm7
+	vmovdqu [rsp + 2*16], xmm8
+%endmacro
+%macro VAES_CBC_FUNC_RESTORE 0
+	vmovdqu xmm6, [rsp + 0*16]
+	vmovdqu xmm7, [rsp + 1*16]
+	vmovdqu xmm8, [rsp + 2*16]
+	add rsp, VAES_CBC_XMM_SAVE_SIZE
+%endmacro
+%else
+%define VAES_CBC_FUNC_SAVE
+%define VAES_CBC_FUNC_RESTORE
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_128_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+mk_global aes_cbc_dec_128_vaes_avx512,function,internal
+aes_cbc_dec_128_vaes_avx512:
+	endbranch
+	VAES_CBC_FUNC_SAVE
+	AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 9, tmp
+	VAES_CBC_FUNC_RESTORE
+
+	ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_192_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+mk_global aes_cbc_dec_192_vaes_avx512,function,internal
+aes_cbc_dec_192_vaes_avx512:
+	endbranch
+	VAES_CBC_FUNC_SAVE
+	AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 11, tmp
+	VAES_CBC_FUNC_RESTORE
+
+	ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+mk_global aes_cbc_dec_256_vaes_avx512,function,internal
+aes_cbc_dec_256_vaes_avx512:
+	endbranch
+	VAES_CBC_FUNC_SAVE
+	AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 13, tmp
+	VAES_CBC_FUNC_RESTORE
+
+	ret
+
+%endif ;; CBCS
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_aes_cbc_dec_256_vaes_avx512
+no_aes_cbc_dec_256_vaes_avx512:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
new file mode 100644
index 000000000..a7fbf39b4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
@@ -0,0 +1,137 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x4(void     *in,
+;;                        uint8_t  *IV,
+;;                        uint8_t  *keys,
+;;                        void     *out,
+;;                        uint64_t  len_bytes);
+; arg 1: IN:   pointer to input (plain text)
+; arg 2: IV:   pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT:  pointer to output (cipher text)
+; arg 5: LEN:  length in bytes (multiple of 16)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 (rsp is 8 mod 16 on entry; this realigns it for movdqa)
+%define arg(x) [rsp + stack_size + PS + PS*x]	; x is zero based: skips the save area and the return address
+
+%define func(x) proc_frame x
+; Win64 prologue: spill the callee-saved xmm6-xmm15 and fetch the 5th argument.
+%macro FUNC_SAVE 0
+	alloc_stack stack_size
+	save_xmm128 xmm6, 0*16
+	save_xmm128 xmm7, 1*16
+	save_xmm128 xmm8, 2*16
+	save_xmm128 xmm9, 3*16
+	save_xmm128 xmm10, 4*16
+	save_xmm128 xmm11, 5*16
+	save_xmm128 xmm12, 6*16
+	save_xmm128 xmm13, 7*16
+	save_xmm128 xmm14, 8*16
+	save_xmm128 xmm15, 9*16
+	end_prolog
+	mov LEN, arg(4)	; len_bytes is the 5th argument, passed on the stack
+%endmacro
+
+; Win64 epilogue: reload xmm6-xmm15 and release the save area.
+%macro FUNC_RESTORE 0
+	movdqa xmm6, [rsp + 0*16]
+	movdqa xmm7, [rsp + 1*16]
+	movdqa xmm8, [rsp + 2*16]
+	movdqa xmm9, [rsp + 3*16]
+	movdqa xmm10, [rsp + 4*16]
+	movdqa xmm11, [rsp + 5*16]
+	movdqa xmm12, [rsp + 6*16]
+	movdqa xmm13, [rsp + 7*16]
+	movdqa xmm14, [rsp + 8*16]
+	movdqa xmm15, [rsp + 9*16]
+	add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+; (XMM_USAGE, PARALLEL_BLOCKS and EARLY_BLOCKS are presumably consumed by
+;  cbc_common.asm, which is not visible here -- confirm there)
+%define KEY_ROUNDS 11	; AES-128 uses 11 round keys (10 rounds + initial key)
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)	; passed to CBC_ENC_SUBLOOP below
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define EARLY_BLOCKS (2)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+; Entry point: set up the key cache and IV, then run CBC_ENC_SUBLOOP until it
+; reports completion through ZF.
+mk_global aes_cbc_enc_128_x4, function
+func(aes_cbc_enc_128_x4)
+	endbranch
+	FUNC_SAVE
+
+	mov IDX, 0	; IDX comes from cbc_common.asm; presumably the running byte offset -- confirm
+	FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+	CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+	; NOTE(review): LEN is not checked before CBC_ENC_INIT; whether len_bytes == 0
+	; is safe depends on that macro -- confirm in cbc_common.asm.
+
+main_loop:
+	CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+	jne main_loop	; uses flags left by CBC_ENC_SUBLOOP (presumably an IDX/LEN compare -- confirm)
+
+done:
+	FUNC_RESTORE
+	ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
new file mode 100644
index 000000000..24ab33fe5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
@@ -0,0 +1,151 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+;; NOTE(review): the clobber statement does not match the win64 path below,
+;; which saves and restores xmm6-xmm15 -- confirm and update.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x8(void     *in,
+;;                        uint8_t  *IV,
+;;                        uint8_t  *keys,
+;;                        void     *out,
+;;                        uint64_t  len_bytes);
+; arg 1: IN:   pointer to input (plain text)
+; arg 2: IV:   pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT:  pointer to output (cipher text)
+; arg 5: LEN:  length in bytes (multiple of 16)
+;; clobbers all registers except for ARG1 and rbp
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8 (rsp is 8 mod 16 on entry; this realigns it for vmovdqa)
+%define arg(x) [rsp + stack_size + PS + PS*x]	; x is zero based: skips the save area and the return address
+
+%define func(x) proc_frame x
+; Win64 prologue: spill the callee-saved xmm6-xmm15 (plain VEX stores here rather
+; than save_xmm128) and fetch the 5th argument.
+%macro FUNC_SAVE 0
+	alloc_stack stack_size
+	vmovdqa [rsp + 0*16], xmm6
+	vmovdqa [rsp + 1*16], xmm7
+	vmovdqa [rsp + 2*16], xmm8
+	vmovdqa [rsp + 3*16], xmm9
+	vmovdqa [rsp + 4*16], xmm10
+	vmovdqa [rsp + 5*16], xmm11
+	vmovdqa [rsp + 6*16], xmm12
+	vmovdqa [rsp + 7*16], xmm13
+	vmovdqa [rsp + 8*16], xmm14
+	vmovdqa [rsp + 9*16], xmm15
+	end_prolog
+	mov LEN, arg(4)	; len_bytes is the 5th argument, passed on the stack
+%endmacro
+
+; Win64 epilogue: reload xmm6-xmm15 and release the save area.
+%macro FUNC_RESTORE 0
+	vmovdqa xmm6, [rsp + 0*16]
+	vmovdqa xmm7, [rsp + 1*16]
+	vmovdqa xmm8, [rsp + 2*16]
+	vmovdqa xmm9, [rsp + 3*16]
+	vmovdqa xmm10, [rsp + 4*16]
+	vmovdqa xmm11, [rsp + 5*16]
+	vmovdqa xmm12, [rsp + 6*16]
+	vmovdqa xmm13, [rsp + 7*16]
+	vmovdqa xmm14, [rsp + 8*16]
+	vmovdqa xmm15, [rsp + 9*16]
+	add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+; (XMM_USAGE, PARALLEL_BLOCKS and IV_CNT are presumably consumed by
+;  cbc_common.asm, which is not visible here -- confirm there)
+%define KEY_ROUNDS 11	; AES-128 uses 11 round keys (10 rounds + initial key)
+%define XMM_USAGE (16)
+%DEFINE UNROLLED_LOOPS (3)	; passed to CBC_ENC_SUBLOOP below
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+; (two-operand wrappers around the three-operand VEX forms, destination = first source)
+%define MOVDQ vmovdqu
+%macro PXOR 2
+	vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+	vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+	vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+; Entry point: set up the key cache and IV, then run CBC_ENC_SUBLOOP until it
+; reports completion through ZF.
+mk_global aes_cbc_enc_128_x8, function
+func(aes_cbc_enc_128_x8)
+	endbranch
+	FUNC_SAVE
+
+	mov IDX, 0	; IDX comes from cbc_common.asm; presumably the running byte offset -- confirm
+
+	FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+	CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+	; NOTE(review): LEN is not checked before CBC_ENC_INIT; whether len_bytes == 0
+	; is safe depends on that macro -- confirm in cbc_common.asm.
+
+main_loop:
+	CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+	jne main_loop	; uses flags left by CBC_ENC_SUBLOOP (presumably an IDX/LEN compare -- confirm)
+
+done:
+
+	FUNC_RESTORE
+	ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
new file mode 100644
index 000000000..b3d80e922
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
@@ -0,0 +1,149 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; routine to do a 192 bit CBC AES encrypt (legacy SSE encodings)

;include "mb_mgr_datastruct.asm"

%include "reg_sizes.asm"

%define MOVDQ movdqu ;; assume buffers not aligned
;; NOTE(review): pxor2 (and the XTMP it references) is not used anywhere in
;; this file's visible code; kept in case cbc_common.asm relies on it.
%macro pxor2 2
	MOVDQ	XTMP, %2
	pxor	%1, XTMP
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;void aes_cbc_enc_192_x4(void     *in,
;;                        uint8_t  *IV,
;;                        uint8_t  *keys,
;;                        void     *out,
;;                        uint64_t  len_bytes);
; arg 1: IN:   pointer to input (plain text)
; arg 2: IV:   pointer to IV
; arg 3: KEYS: pointer to keys
; arg 4: OUT:  pointer to output (cipher text)
; arg 5: LEN:  length in bytes (multiple of 16)
;
; NOTE(review): the original header also stated "Updates In and Out
; pointers at end"; that behaviour would live in cbc_common.asm, which is
; not visible here -- confirm there.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; System V AMD64: the five arguments arrive in rdi, rsi, rdx, rcx, r8,
;; so no prologue/epilogue work is required.
%ifidn __OUTPUT_FORMAT__, elf64
%define IN0		rdi
%define IN		rdi
%define IV		rsi
%define KEYS		rdx
%define OUT		rcx
%define LEN		r8
%define KEYS0		rdx
%define OUT0		rcx
%define func(x)		x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

;; Microsoft x64: arguments 1-4 arrive in rcx, rdx, r8, r9; the fifth is
;; read from the caller's stack.  xmm6-xmm15 are callee-saved in this ABI
;; and are therefore spilled to a local frame.
%ifidn __OUTPUT_FORMAT__, win64
%define IN0		rcx
%define IN		rcx
%define IV		rdx
%define KEYS0		r8
%define OUT0		r9
%define KEYS		r8
%define OUT		r9
%define LEN		r10
%define PS		8
;; rsp is 8 mod 16 on entry; an odd multiple of 8 makes it 16-byte aligned,
;; which the aligned movdqa reloads below require.
%define stack_size	10*16 + 1*8	; must be an odd multiple of 8
;; arg(x): x-th (0-based) argument slot, located above the local frame
;; (stack_size) and the return address (PS).
%define arg(x)		[rsp + stack_size + PS + PS*x]

%define func(x)		proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6,  0*16
	save_xmm128	xmm7,  1*16
	save_xmm128	xmm8,  2*16
	save_xmm128	xmm9,  3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_xmm128	xmm15, 9*16
	end_prolog
	mov	LEN, arg(4)		; fifth argument lives on the stack
%endmacro

%macro FUNC_RESTORE 0
	movdqa	xmm6,  [rsp + 0*16]
	movdqa	xmm7,  [rsp + 1*16]
	movdqa	xmm8,  [rsp + 2*16]
	movdqa	xmm9,  [rsp + 3*16]
	movdqa	xmm10, [rsp + 4*16]
	movdqa	xmm11, [rsp + 5*16]
	movdqa	xmm12, [rsp + 6*16]
	movdqa	xmm13, [rsp + 7*16]
	movdqa	xmm14, [rsp + 8*16]
	movdqa	xmm15, [rsp + 9*16]
	add	rsp, stack_size
%endmacro
%endif

;; Parameters consumed by the shared macros in cbc_common.asm.
;; KEY_ROUNDS is presumably the round-key count (AES-192 has 13) -- confirm
;; against CBC_ENC_SUBLOOP.
%define KEY_ROUNDS	13
%define XMM_USAGE	(16)
%define UNROLLED_LOOPS	(3)
%define PARALLEL_BLOCKS	(UNROLLED_LOOPS)

; instruction set specific operation definitions
%define MOVDQ		movdqu
%define PXOR		pxor
%define AES_ENC		aesenc
%define AES_ENC_LAST	aesenclast

%include "cbc_common.asm"


mk_global aes_cbc_enc_192_x4, function
func(aes_cbc_enc_192_x4)
	endbranch
	FUNC_SAVE

	xor	IDX, IDX		; IDX = 0 (IDX is passed to the macros below together with IN/OUT/LEN)

	FILL_KEY_CACHE	CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
	CBC_ENC_INIT	FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX

main_loop:
	CBC_ENC_SUBLOOP	KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
	; The macro is expected to leave ZF clear while data remains
	; (presumably by comparing IDX with LEN -- see cbc_common.asm).
	jne	main_loop

done:
	FUNC_RESTORE
	ret

endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm new file mode 100644 index 000000000..89d233819 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm @@ -0,0 +1,147 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; routine to do a 192 bit CBC AES encrypt (VEX-encoded / AVX instructions)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;void aes_cbc_enc_192_x8(void     *in,
;;                        uint8_t  *IV,
;;                        uint8_t  *keys,
;;                        void     *out,
;;                        uint64_t  len_bytes);
; arg 1: IN:   pointer to input (plain text)
; arg 2: IV:   pointer to IV
; arg 3: KEYS: pointer to keys
; arg 4: OUT:  pointer to output (cipher text)
; arg 5: LEN:  length in bytes (multiple of 16)
;
; NOTE(review): the original header also stated "clobbers all registers
; except for ARG1 and rbp" and "Updates In and Out pointers at end".  The
; working registers are chosen in cbc_common.asm, which is not visible
; here -- confirm both statements against that file.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

;; System V AMD64: the five arguments arrive in rdi, rsi, rdx, rcx, r8,
;; so no prologue/epilogue work is required.
%ifidn __OUTPUT_FORMAT__, elf64
%define IN0		rdi
%define IN		rdi
%define IV		rsi
%define KEYS		rdx
%define OUT		rcx
%define LEN		r8
%define KEYS0		rdx
%define OUT0		rcx
%define func(x)		x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

;; Microsoft x64: arguments 1-4 arrive in rcx, rdx, r8, r9; the fifth is
;; read from the caller's stack.  xmm6-xmm15 are callee-saved in this ABI
;; and are therefore spilled to a local frame.
%ifidn __OUTPUT_FORMAT__, win64
%define IN0		rcx
%define IN		rcx
%define IV		rdx
%define KEYS0		r8
%define OUT0		r9
%define KEYS		r8
%define OUT		r9
%define LEN		r10
%define PS		8
;; rsp is 8 mod 16 on entry; an odd multiple of 8 makes it 16-byte aligned,
;; which the aligned vmovdqa spills below require.
%define stack_size	10*16 + 1*8	; must be an odd multiple of 8
;; arg(x): x-th (0-based) argument slot, located above the local frame
;; (stack_size) and the return address (PS).
%define arg(x)		[rsp + stack_size + PS + PS*x]

%define func(x)		proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	vmovdqa	[rsp + 9*16], xmm15
	end_prolog
	mov	LEN, arg(4)		; fifth argument lives on the stack
%endmacro

%macro FUNC_RESTORE 0
	vmovdqa	xmm6,  [rsp + 0*16]
	vmovdqa	xmm7,  [rsp + 1*16]
	vmovdqa	xmm8,  [rsp + 2*16]
	vmovdqa	xmm9,  [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]
	add	rsp, stack_size
%endmacro
%endif

;; Parameters consumed by the shared macros in cbc_common.asm.
;; KEY_ROUNDS is presumably the round-key count (AES-192 has 13) -- confirm
;; against CBC_ENC_SUBLOOP.
%define KEY_ROUNDS	13
%define XMM_USAGE	(16)
%define UNROLLED_LOOPS	(3)
%define PARALLEL_BLOCKS	(UNROLLED_LOOPS)

; instruction set specific operation definitions
; (two-operand wrappers around the three-operand VEX forms)
%define MOVDQ		vmovdqu
%macro PXOR 2
	vpxor	%1, %1, %2
%endm

%macro AES_ENC 2
	vaesenc	%1, %1, %2
%endm

%macro AES_ENC_LAST 2
	vaesenclast	%1, %1, %2
%endm

%include "cbc_common.asm"

mk_global aes_cbc_enc_192_x8, function
func(aes_cbc_enc_192_x8)
	endbranch
	FUNC_SAVE

	xor	IDX, IDX		; IDX = 0 (IDX is passed to the macros below together with IN/OUT/LEN)

	FILL_KEY_CACHE	CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
	CBC_ENC_INIT	FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX

main_loop:
	CBC_ENC_SUBLOOP	KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
	; The macro is expected to leave ZF clear while data remains
	; (presumably by comparing IDX with LEN -- see cbc_common.asm).
	jne	main_loop

done:
	FUNC_RESTORE
	ret

endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm 
b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm new file mode 100644 index 000000000..ab37668c7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm @@ -0,0 +1,141 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; routine to do a 256 bit CBC AES encrypt (legacy SSE encodings)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;void aes_cbc_enc_256_x4(void     *in,
;;                        uint8_t  *IV,
;;                        uint8_t  *keys,
;;                        void     *out,
;;                        uint64_t  len_bytes);
; arg 1: IN:   pointer to input (plain text)
; arg 2: IV:   pointer to IV
; arg 3: KEYS: pointer to keys
; arg 4: OUT:  pointer to output (cipher text)
; arg 5: LEN:  length in bytes (multiple of 16)
;
; NOTE(review): the original header also stated "Updates In and Out
; pointers at end"; that behaviour would live in cbc_common.asm, which is
; not visible here -- confirm there.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

;; System V AMD64: the five arguments arrive in rdi, rsi, rdx, rcx, r8,
;; so no prologue/epilogue work is required.
%ifidn __OUTPUT_FORMAT__, elf64
%define IN0		rdi
%define IN		rdi
%define IV		rsi
%define KEYS		rdx
%define OUT		rcx
%define LEN		r8
%define KEYS0		rdx
%define OUT0		rcx
%define func(x)		x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

;; Microsoft x64: arguments 1-4 arrive in rcx, rdx, r8, r9; the fifth is
;; read from the caller's stack.  xmm6-xmm15 are callee-saved in this ABI
;; and are therefore spilled to a local frame.
%ifidn __OUTPUT_FORMAT__, win64
%define IN0		rcx
%define IN		rcx
%define IV		rdx
%define KEYS0		r8
%define OUT0		r9
%define KEYS		r8
%define OUT		r9
%define LEN		r10
%define PS		8
;; rsp is 8 mod 16 on entry; an odd multiple of 8 makes it 16-byte aligned,
;; which the aligned movdqa reloads below require.
%define stack_size	10*16 + 1*8	; must be an odd multiple of 8
;; arg(x): x-th (0-based) argument slot, located above the local frame
;; (stack_size) and the return address (PS).
%define arg(x)		[rsp + stack_size + PS + PS*x]

%define func(x)		proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	save_xmm128	xmm6,  0*16
	save_xmm128	xmm7,  1*16
	save_xmm128	xmm8,  2*16
	save_xmm128	xmm9,  3*16
	save_xmm128	xmm10, 4*16
	save_xmm128	xmm11, 5*16
	save_xmm128	xmm12, 6*16
	save_xmm128	xmm13, 7*16
	save_xmm128	xmm14, 8*16
	save_xmm128	xmm15, 9*16
	end_prolog
	mov	LEN, arg(4)		; fifth argument lives on the stack
%endmacro

%macro FUNC_RESTORE 0
	movdqa	xmm6,  [rsp + 0*16]
	movdqa	xmm7,  [rsp + 1*16]
	movdqa	xmm8,  [rsp + 2*16]
	movdqa	xmm9,  [rsp + 3*16]
	movdqa	xmm10, [rsp + 4*16]
	movdqa	xmm11, [rsp + 5*16]
	movdqa	xmm12, [rsp + 6*16]
	movdqa	xmm13, [rsp + 7*16]
	movdqa	xmm14, [rsp + 8*16]
	movdqa	xmm15, [rsp + 9*16]
	add	rsp, stack_size
%endmacro
%endif

;; Parameters consumed by the shared macros in cbc_common.asm.
;; KEY_ROUNDS is presumably the round-key count (AES-256 has 15) -- confirm
;; against CBC_ENC_SUBLOOP.
%define KEY_ROUNDS	15
%define XMM_USAGE	(16)
%define UNROLLED_LOOPS	(3)
%define PARALLEL_BLOCKS	(UNROLLED_LOOPS)

; instruction set specific operation definitions
%define MOVDQ		movdqu
%define PXOR		pxor
%define AES_ENC		aesenc
%define AES_ENC_LAST	aesenclast

%include "cbc_common.asm"


mk_global aes_cbc_enc_256_x4, function
func(aes_cbc_enc_256_x4)
	endbranch
	FUNC_SAVE

	xor	IDX, IDX		; IDX = 0 (IDX is passed to the macros below together with IN/OUT/LEN)

	FILL_KEY_CACHE	CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
	CBC_ENC_INIT	FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX

main_loop:
	CBC_ENC_SUBLOOP	KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
	; The macro is expected to leave ZF clear while data remains
	; (presumably by comparing IDX with LEN -- see cbc_common.asm).
	jne	main_loop

done:
	FUNC_RESTORE
	ret

endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm new file mode 100644 index 000000000..83e53ac11 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm @@ -0,0 +1,148 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; routine to do a 256 bit CBC AES encrypt (VEX-encoded / AVX instructions)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;void aes_cbc_enc_256_x8(void     *in,
;;                        uint8_t  *IV,
;;                        uint8_t  *keys,
;;                        void     *out,
;;                        uint64_t  len_bytes);
; arg 1: IN:   pointer to input (plain text)
; arg 2: IV:   pointer to IV
; arg 3: KEYS: pointer to keys
; arg 4: OUT:  pointer to output (cipher text)
; arg 5: LEN:  length in bytes (multiple of 16)
;
; NOTE(review): the original header also stated "clobbers all registers
; except for ARG1 and rbp" and "Updates In and Out pointers at end".  The
; working registers are chosen in cbc_common.asm, which is not visible
; here -- confirm both statements against that file.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

;; System V AMD64: the five arguments arrive in rdi, rsi, rdx, rcx, r8,
;; so no prologue/epilogue work is required.
%ifidn __OUTPUT_FORMAT__, elf64
%define IN0		rdi
%define IN		rdi
%define IV		rsi
%define KEYS		rdx
%define OUT		rcx
%define LEN		r8
%define KEYS0		rdx
%define OUT0		rcx
%define func(x)		x:
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

;; Microsoft x64: arguments 1-4 arrive in rcx, rdx, r8, r9; the fifth is
;; read from the caller's stack.  xmm6-xmm15 are callee-saved in this ABI
;; and are therefore spilled to a local frame.
%ifidn __OUTPUT_FORMAT__, win64
%define IN0		rcx
%define IN		rcx
%define IV		rdx
%define KEYS0		r8
%define OUT0		r9
%define KEYS		r8
%define OUT		r9
%define LEN		r10
%define PS		8
;; rsp is 8 mod 16 on entry; an odd multiple of 8 makes it 16-byte aligned,
;; which the aligned vmovdqa spills below require.
%define stack_size	10*16 + 1*8	; must be an odd multiple of 8
;; arg(x): x-th (0-based) argument slot, located above the local frame
;; (stack_size) and the return address (PS).
%define arg(x)		[rsp + stack_size + PS + PS*x]

%define func(x)		proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	vmovdqa	[rsp + 9*16], xmm15
	end_prolog
	mov	LEN, arg(4)		; fifth argument lives on the stack
%endmacro

%macro FUNC_RESTORE 0
	vmovdqa	xmm6,  [rsp + 0*16]
	vmovdqa	xmm7,  [rsp + 1*16]
	vmovdqa	xmm8,  [rsp + 2*16]
	vmovdqa	xmm9,  [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]
	add	rsp, stack_size
%endmacro
%endif

;; Parameters consumed by the shared macros in cbc_common.asm.
;; KEY_ROUNDS is presumably the round-key count (AES-256 has 15) -- confirm
;; against CBC_ENC_SUBLOOP.
%define KEY_ROUNDS	15
%define XMM_USAGE	(16)
%define UNROLLED_LOOPS	(3)
%define PARALLEL_BLOCKS	(UNROLLED_LOOPS)

; instruction set specific operation definitions
; (two-operand wrappers around the three-operand VEX forms)
%define MOVDQ		vmovdqu
%macro PXOR 2
	vpxor	%1, %1, %2
%endm

%macro AES_ENC 2
	vaesenc	%1, %1, %2
%endm

%macro AES_ENC_LAST 2
	vaesenclast	%1, %1, %2
%endm

%include "cbc_common.asm"


mk_global aes_cbc_enc_256_x8, function
func(aes_cbc_enc_256_x8)
	endbranch
	FUNC_SAVE

	xor	IDX, IDX		; IDX = 0 (IDX is passed to the macros below together with IN/OUT/LEN)

	FILL_KEY_CACHE	CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
	CBC_ENC_INIT	FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX

main_loop:
	CBC_ENC_SUBLOOP	KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
	; The macro is expected to leave ZF clear while data remains
	; (presumably by comparing IDX with LEN -- see cbc_common.asm).
	jne	main_loop

done:
	FUNC_RESTORE
	ret

endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm new file mode 100644 index 000000000..0cc09afe1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm @@ -0,0 +1,102 @@ 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

default rel
[bits 64]

;; CBC decrypt implementations (SSE and AVX flavours per key size)
extern aes_cbc_dec_128_sse
extern aes_cbc_dec_128_avx
extern aes_cbc_dec_192_sse
extern aes_cbc_dec_192_avx
extern aes_cbc_dec_256_sse
extern aes_cbc_dec_256_avx

;; CBC encrypt implementations (x4 and x8 flavours per key size)
extern aes_cbc_enc_128_x4
extern aes_cbc_enc_128_x8
extern aes_cbc_enc_192_x4
extern aes_cbc_enc_192_x8
extern aes_cbc_enc_256_x4
extern aes_cbc_enc_256_x8

;; VAES/AVX-512 decrypt implementations are only declared when the
;; assembler feature level is at least 10.
%if (AS_FEATURE_LEVEL) >= 10
extern aes_cbc_dec_128_vaes_avx512
extern aes_cbc_dec_192_vaes_avx512
extern aes_cbc_dec_256_vaes_avx512
%endif

%include "multibinary.asm"

;;;;
; instantiate aesni_cbc interfaces enc and dec
;;;;

;; --- decrypt: one interface per key size, seven-argument dispatch -------
mbin_interface		aes_cbc_dec_128
mbin_dispatch_init7	aes_cbc_dec_128, \
			aes_cbc_dec_128_sse, \
			aes_cbc_dec_128_sse, \
			aes_cbc_dec_128_avx, \
			aes_cbc_dec_128_avx, \
			aes_cbc_dec_128_avx, \
			aes_cbc_dec_128_vaes_avx512

mbin_interface		aes_cbc_dec_192
mbin_dispatch_init7	aes_cbc_dec_192, \
			aes_cbc_dec_192_sse, \
			aes_cbc_dec_192_sse, \
			aes_cbc_dec_192_avx, \
			aes_cbc_dec_192_avx, \
			aes_cbc_dec_192_avx, \
			aes_cbc_dec_192_vaes_avx512

mbin_interface		aes_cbc_dec_256
mbin_dispatch_init7	aes_cbc_dec_256, \
			aes_cbc_dec_256_sse, \
			aes_cbc_dec_256_sse, \
			aes_cbc_dec_256_avx, \
			aes_cbc_dec_256_avx, \
			aes_cbc_dec_256_avx, \
			aes_cbc_dec_256_vaes_avx512

;; --- encrypt: one interface per key size, x4 first, then x8 twice -------
mbin_interface		aes_cbc_enc_128
mbin_dispatch_init	aes_cbc_enc_128, \
			aes_cbc_enc_128_x4, \
			aes_cbc_enc_128_x8, \
			aes_cbc_enc_128_x8

mbin_interface		aes_cbc_enc_192
mbin_dispatch_init	aes_cbc_enc_192, \
			aes_cbc_enc_192_x4, \
			aes_cbc_enc_192_x8, \
			aes_cbc_enc_192_x8

mbin_interface		aes_cbc_enc_256
mbin_dispatch_init	aes_cbc_enc_256, \
			aes_cbc_enc_256_x4, \
			aes_cbc_enc_256_x8, \
			aes_cbc_enc_256_x8



;;;       func            	core, ver, snum
slversion aes_cbc_enc_128,	00,   00,  0291
slversion aes_cbc_dec_128,	00,   00,  0292
slversion aes_cbc_enc_192,	00,   00,  0293
slversion aes_cbc_dec_192,	00,   00,  0294
slversion aes_cbc_enc_256,	00,   00,  0295
slversion aes_cbc_dec_256,	00,   00,  0296
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c new file mode 100644 index 000000000..7ae5c9078 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c @@ -0,0 +1,339 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include <aes_cbc.h> +#include <test.h> +#include "ossl_helper.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +static unsigned char const ic[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f +}; + +static unsigned char *plaintext, *cbc_plaintext, *cyphertext, *ossl_plaintext, + *ossl_cyphertext; +static uint8_t test_key[CBC_256_BITS]; + +void mk_rand_data(uint8_t * data, uint32_t size) +{ + unsigned int i; + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +int aes_128_perf(uint8_t * key) +{ + int i, ret; + + /* Initialize our cipher context, which can use same input vectors */ + uint8_t *iv = NULL; + struct cbc_key_data *key_data = NULL; + + ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data))); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + if ((NULL == iv) || (NULL == key_data)) + return 1; + + memcpy(iv, ic, CBC_IV_DATA_LEN); + + aes_cbc_precomp(key, 128, key_data); + aes_cbc_enc_128(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN); + openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_enc_128(plaintext, iv, key_data->enc_keys, + plaintext, TEST_LEN); 
+ } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_128_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, plaintext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_128_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_dec_128(cyphertext, iv, key_data->dec_keys, + cbc_plaintext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_128_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_128_cbc_dec(key, iv, TEST_LEN, + ossl_cyphertext, ossl_plaintext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_128_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + printf("\n"); + return 0; +} + +int aes_192_perf(uint8_t * key) +{ + int i, ret; + uint8_t *iv = NULL; + struct cbc_key_data *key_data = NULL; + + ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data))); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + if ((NULL == iv) || (NULL == key_data)) + return 1; + + memcpy(iv, ic, CBC_IV_DATA_LEN); + aes_cbc_precomp(key, 192, key_data); + aes_cbc_enc_192(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN); + openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_enc_192(plaintext, iv, key_data->enc_keys, + cyphertext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_192_encode" TEST_TYPE_STR ": 
"); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_192_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_dec_192(cyphertext, iv, key_data->dec_keys, + cbc_plaintext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_192_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_192_cbc_dec(key, iv, TEST_LEN, + ossl_cyphertext, ossl_plaintext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_192_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + printf("\n"); + return 0; +} + +int aes_256_perf(uint8_t * key) +{ + int i, ret; + uint8_t *iv = NULL; + struct cbc_key_data *key_data = NULL; + + ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data))); + if (ret) { + printf("alloc error: Fail"); + return 1; + } + if ((NULL == iv) || (NULL == key_data)) + return 1; + + aes_cbc_precomp(key, 256, key_data); + memcpy(iv, ic, CBC_IV_DATA_LEN); + aes_cbc_enc_256(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN); + openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_enc_256(plaintext, iv, key_data->enc_keys, + cyphertext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_256_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct 
perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_256_encode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + aes_cbc_dec_256(cyphertext, iv, key_data->dec_keys, + cbc_plaintext, TEST_LEN); + } + + perf_stop(&stop); + printf("ISA-L__aes_cbc_256_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + { + struct perf start, stop; + + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) { + openssl_aes_256_cbc_dec(key, iv, TEST_LEN, + ossl_cyphertext, ossl_plaintext); + } + + perf_stop(&stop); + printf("OpenSSL_aes_cbc_256_decode" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + } + printf("\n"); + return 0; +} + +int main(void) +{ + uint32_t OK = 0; + + srand(TEST_SEED); + + plaintext = malloc(TEST_LEN); + cbc_plaintext = malloc(TEST_LEN); + cyphertext = malloc(TEST_LEN); + ossl_plaintext = malloc(TEST_LEN); + ossl_cyphertext = malloc(TEST_LEN); + if (NULL == plaintext || NULL == cyphertext || NULL == cbc_plaintext + || NULL == ossl_plaintext || NULL == ossl_cyphertext) { + printf("malloc of testsize:0x%x failed\n", TEST_LEN); + return 1; + } + + mk_rand_data(plaintext, TEST_LEN); + mk_rand_data(test_key, sizeof(test_key)); + printf("AES CBC ISA-L vs OpenSSL performance:\n"); + OK += aes_128_perf(test_key); + OK += aes_192_perf(test_key); + OK += aes_256_perf(test_key); + + return OK; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c new file mode 100644 index 000000000..8e8f41792 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c @@ -0,0 +1,56 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All 
rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <aes_cbc.h> +#include <aes_keyexp.h> + +int aes_cbc_precomp(uint8_t * key, int key_size, struct cbc_key_data *keys_blk) +{ + if (CBC_128_BITS == key_size) { + aes_keyexp_128(key, keys_blk->enc_keys, keys_blk->dec_keys); + } else if (CBC_192_BITS == key_size) { + aes_keyexp_192(key, keys_blk->enc_keys, keys_blk->dec_keys); + } else if (CBC_256_BITS == key_size) { + aes_keyexp_256(key, keys_blk->enc_keys, keys_blk->dec_keys); + } else { + //Invalid key length + return 1; + } + return 0; +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// Version info +struct slver aes_cbc_precomp_slver_00000297; +struct slver aes_cbc_precomp_slver = { 0x0297, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h new file mode 100644 index 000000000..7bebcaed4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h @@ -0,0 +1,466 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

#ifndef AES_CBC_STD_VECTORS_H_
#define AES_CBC_STD_VECTORS_H_
#include <aes_cbc.h>


// One AES-CBC test case: pointers to the key, IV, plaintext and expected
// ciphertext, plus scratch pointers that the test code fills in per run.
struct cbc_vector {
	uint8_t* K;		// AES key bytes
	cbc_key_size K_LEN;	// key length as a cbc_key_size value.
				// NOTE(review): the vector() macro in this header fills
				// this with sizeof(K##N), a byte count, so the earlier
				// "in bits" wording looks wrong - confirm against aes_cbc.h
	uint8_t* IV;		// initialization vector for CBC (the earlier comment said GCM)
	uint64_t P_LEN;		// length of the plaintext in bytes
	uint8_t* P;		// plaintext
	// outputs of encryption
	uint8_t* EXP_C;		// expected ciphertext, same length as P
	// used in vector checks, not populated in std vector array
	uint8_t *C;		// buffer that receives the computed ciphertext
	struct cbc_key_data *KEYS;	// expanded key schedule, allocated by the test
};


///////////////////////////////////////////
// Test vectors from:
//  Intel IPSec library 1..3
//
///////////////////////////////////////////
static unsigned char K1[] = {
	0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c
};
static unsigned char IV1[] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
};
static unsigned char P1[] = {
	0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
	0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac,
0x45, 0xaf, 0x8e, 0x51, + 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef, + 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 +}; +static unsigned char C1[] = { + 0x76, 0x49, 0xab, 0xac, 0x81, 0x19, 0xb2, 0x46, 0xce, 0xe9, 0x8e, 0x9b, 0x12, 0xe9, 0x19, 0x7d, + 0x50, 0x86, 0xcb, 0x9b, 0x50, 0x72, 0x19, 0xee, 0x95, 0xdb, 0x11, 0x3a, 0x91, 0x76, 0x78, 0xb2, + 0x73, 0xbe, 0xd6, 0xb8, 0xe3, 0xc1, 0x74, 0x3b, 0x71, 0x16, 0xe6, 0x9e, 0x22, 0x22, 0x95, 0x16, + 0x3f, 0xf1, 0xca, 0xa1, 0x68, 0x1f, 0xac, 0x09, 0x12, 0x0e, 0xca, 0x30, 0x75, 0x86, 0xe1, 0xa7 +}; + +static unsigned char K2[] = { + 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4 +}; +static unsigned char IV2[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +}; +static unsigned char P2[] = { + 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a, + 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51, + 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef, + 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 +}; +static unsigned char C2[] = { + 0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6, + 0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d, + 0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61, + 0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b +}; + +static unsigned char K3[] = { + 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 
0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7 +}; +static unsigned char IV3[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +}; +static unsigned char P3[] = { + 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a, + 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51, + 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef, + 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10 +}; +static unsigned char C3[] = { + 0x17, 0x70, 0x1a, 0x9d, 0x29, 0xc9, 0x1a, 0x94, 0xce, 0xed, 0x72, 0x3c, 0x34, 0xe8, + 0x7a, 0xbe, 0x1c, 0x96, 0x84, 0x5c, 0xa8, 0xb7, 0xe8, 0x58, 0x6d, 0xfe, 0xf2, 0xfa, + 0x6b, 0xed, 0x24, 0x09, 0x8a, 0x52, 0xce, 0xe8, 0xd7, 0x6d, 0xb6, 0x7b, 0xfd, 0xe2, + 0x15, 0x53, 0xd3, 0x1c, 0x28, 0x33, 0xf7, 0x7e, 0xb5, 0x95, 0x00, 0xac, 0x49, 0x03, + 0xbc, 0x70, 0x76, 0xb1, 0x84, 0x65, 0xd0, 0xea +}; + +/////////////////////////////////////////// +// Test vectors from: +// 'https://tools.ietf.org/html/rfc3602#section-3.2' +// The AES-CBC Cipher Algorithm and Its Use with IPsec +// +/////////////////////////////////////////// +/* +Case #1: Encrypting 16 bytes (1 block) using AES-CBC with 128-bit key +Key : 0x06a9214036b8a15b512e03d534120006 +IV : 0x3dafba429d9eb430b422da802c9fac41 +Plaintext : "Single block msg" +Ciphertext: 0xe353779c1079aeb82708942dbe77181a + * + */ +static unsigned char K4[] = { + 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b, 0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06 +}; +static unsigned char IV4[] = { + 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30, 0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41 +}; +static unsigned char P4[] = { + "Single block msg" +}; +static unsigned char C4[] = { + 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8, 0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a +}; + +/* +Case #2: Encrypting 32 
bytes (2 blocks) using AES-CBC with 128-bit key +Key : 0xc286696d887c9aa0611bbb3e2025a45a +IV : 0x562e17996d093d28ddb3ba695a2e6f58 +Plaintext : 0x000102030405060708090a0b0c0d0e0f + 101112131415161718191a1b1c1d1e1f +Ciphertext: 0xd296cd94c2cccf8a3a863028b5e1dc0a + 7586602d253cfff91b8266bea6d61ab1 +*/ +static unsigned char K5[] = { + 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0, 0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a +}; +static unsigned char IV5[] = { + 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28, 0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58 +}; +static unsigned char P5[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, + 0x1c, 0x1d, 0x1e, 0x1f +}; +static unsigned char C5[] = { + 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a, 0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1, + 0xdc, 0x0a, 0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9, 0x1b, 0x82, 0x66, 0xbe, + 0xa6, 0xd6, 0x1a, 0xb1 +}; + +/* +Case #3: Encrypting 48 bytes (3 blocks) using AES-CBC with 128-bit key +Key : 0x6c3ea0477630ce21a2ce334aa746c2cd +IV : 0xc782dc4c098c66cbd9cd27d825682c81 +Plaintext : "This is a 48-byte message (exactly 3 AES blocks)" +Ciphertext: 0xd0a02b3836451753d493665d33f0e886 + 2dea54cdb293abc7506939276772f8d5 + 021c19216bad525c8579695d83ba2684 + + */ +static unsigned char K6[] = { + 0x6c, 0x3e, 0xa0, 0x47, 0x76, 0x30, 0xce, 0x21, 0xa2, 0xce, 0x33, 0x4a, 0xa7, 0x46, 0xc2, 0xcd +}; +static unsigned char IV6[] = { + 0xc7, 0x82, 0xdc, 0x4c, 0x09, 0x8c, 0x66, 0xcb, 0xd9, 0xcd, 0x27, 0xd8, 0x25, 0x68, 0x2c, 0x81 +}; +static unsigned char P6[] = { + "This is a 48-byte message (exactly 3 AES blocks)" +}; +static unsigned char C6[] = { + 0xd0, 0xa0, 0x2b, 0x38, 0x36, 0x45, 0x17, 0x53, 0xd4, 0x93, 0x66, 0x5d, 0x33, 0xf0, 0xe8, 0x86, + 0x2d, 0xea, 0x54, 0xcd, 0xb2, 0x93, 0xab, 0xc7, 0x50, 0x69, 0x39, 0x27, 0x67, 0x72, 0xf8, 0xd5, + 0x02, 0x1c, 0x19, 0x21, 0x6b, 0xad, 
0x52, 0x5c, 0x85, 0x79, 0x69, 0x5d, 0x83, 0xba, 0x26, 0x84 +}; + +/* +Case #4: Encrypting 64 bytes (4 blocks) using AES-CBC with 128-bit key +Key : 0x56e47a38c5598974bc46903dba290349 +IV : 0x8ce82eefbea0da3c44699ed7db51b7d9 +Plaintext : 0xa0a1a2a3a4a5a6a7a8a9aaabacadaeaf + b0b1b2b3b4b5b6b7b8b9babbbcbdbebf + c0c1c2c3c4c5c6c7c8c9cacbcccdcecf + d0d1d2d3d4d5d6d7d8d9dadbdcdddedf +Ciphertext: 0xc30e32ffedc0774e6aff6af0869f71aa + 0f3af07a9a31a9c684db207eb0ef8e4e + 35907aa632c3ffdf868bb7b29d3d46ad + 83ce9f9a102ee99d49a53e87f4c3da55 + */ +static unsigned char K7[] = { + 0x56, 0xe4, 0x7a, 0x38, 0xc5, 0x59, 0x89, 0x74, 0xbc, 0x46, 0x90, 0x3d, 0xba, 0x29, 0x03, 0x49 +}; +static unsigned char IV7[] = { + 0x8c, 0xe8, 0x2e, 0xef, 0xbe, 0xa0, 0xda, 0x3c, 0x44, 0x69, 0x9e, 0xd7, 0xdb, 0x51, 0xb7, 0xd9 +}; +static unsigned char P7[] = { + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf +}; +static unsigned char C7[] = { + 0xc3, 0x0e, 0x32, 0xff, 0xed, 0xc0, 0x77, 0x4e, 0x6a, 0xff, 0x6a, 0xf0, 0x86, 0x9f, 0x71, 0xaa, + 0x0f, 0x3a, 0xf0, 0x7a, 0x9a, 0x31, 0xa9, 0xc6, 0x84, 0xdb, 0x20, 0x7e, 0xb0, 0xef, 0x8e, 0x4e, + 0x35, 0x90, 0x7a, 0xa6, 0x32, 0xc3, 0xff, 0xdf, 0x86, 0x8b, 0xb7, 0xb2, 0x9d, 0x3d, 0x46, 0xad, + 0x83, 0xce, 0x9f, 0x9a, 0x10, 0x2e, 0xe9, 0x9d, 0x49, 0xa5, 0x3e, 0x87, 0xf4, 0xc3, 0xda, 0x55 +}; + +/* +Case #5: Sample transport-mode ESP packet (ping 192.168.123.100) +Key: 90d382b4 10eeba7a d938c46c ec1a82bf +SPI: 4321 +Source address: 192.168.123.3 +Destination address: 192.168.123.100 +Sequence number: 1 +IV: e96e8c08 ab465763 fd098d45 dd3ff893 + +Original packet: +IP header (20 bytes): 45000054 08f20000 4001f9fe c0a87b03 c0a87b64 
+Data (64 bytes): +08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617 +18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637 + +Augment data with: +Padding: 01020304 05060708 090a0b0c 0d0e +Pad length: 0e +Next header: 01 (ICMP) + +Pre-encryption Data with padding, pad length and next header (80 bytes): +08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617 +18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637 +01020304 05060708 090a0b0c 0d0e0e01 + +Post-encryption packet with SPI, Sequence number, IV: +IP header: 4500007c 08f20000 4032f9a5 c0a87b03 c0a87b64 +SPI/Seq #: 00004321 00000001 +IV: e96e8c08 ab465763 fd098d45 dd3ff893 +Encrypted Data (80 bytes): +f663c25d 325c18c6 a9453e19 4e120849 a4870b66 cc6b9965 330013b4 898dc856 +a4699e52 3a55db08 0b59ec3a 8e4b7e52 775b07d1 db34ed9c 538ab50c 551b874a +a269add0 47ad2d59 13ac19b7 cfbad4a6 +*/ +static unsigned char K8[] = { + 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf +}; +static unsigned char IV8[] = { + 0xe9, 0x6e, 0x8c, 0x08, 0xab, 0x46, 0x57, 0x63, 0xfd, 0x09, 0x8d, 0x45, 0xdd, 0x3f, 0xf8, 0x93 +}; +static unsigned char P8[] = { + 0x08, 0x00, 0x0e, 0xbd, 0xa7, 0x0a, 0x00, 0x00, 0x8e, 0x9c, 0x08, 0x3d, 0xb9, 0x5b, 0x07, 0x00, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x01 +}; +static unsigned char C8[] = { + 0xf6, 0x63, 0xc2, 0x5d, 0x32, 0x5c, 0x18, 0xc6, 0xa9, 0x45, 0x3e, 0x19, 0x4e, 0x12, 0x08, 0x49, + 0xa4, 0x87, 0x0b, 0x66, 0xcc, 0x6b, 0x99, 0x65, 0x33, 0x00, 0x13, 0xb4, 0x89, 0x8d, 0xc8, 0x56, + 0xa4, 0x69, 0x9e, 0x52, 0x3a, 0x55, 0xdb, 0x08, 0x0b, 
0x59, 0xec, 0x3a, 0x8e, 0x4b, 0x7e, 0x52, + 0x77, 0x5b, 0x07, 0xd1, 0xdb, 0x34, 0xed, 0x9c, 0x53, 0x8a, 0xb5, 0x0c, 0x55, 0x1b, 0x87, 0x4a, + 0xa2, 0x69, 0xad, 0xd0, 0x47, 0xad, 0x2d, 0x59, 0x13, 0xac, 0x19, 0xb7, 0xcf, 0xba, 0xd4, 0xa6 +}; + +/* +Case #6: Sample transport-mode ESP packet + (ping -p 77 -s 20 192.168.123.100) +Key: 90d382b4 10eeba7a d938c46c ec1a82bf +SPI: 4321 +Source address: 192.168.123.3 +Destination address: 192.168.123.100 +Sequence number: 8 +IV: 69d08df7 d203329d b093fc49 24e5bd80 + +Original packet: +IP header (20 bytes): 45000030 08fe0000 4001fa16 c0a87b03 c0a87b64 +Data (28 bytes): +0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777 + +Augment data with: +Padding: 0102 +Pad length: 02 +Next header: 01 (ICMP) + +Pre-encryption Data with padding, pad length and next header (32 bytes): +0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777 01020201 + +Post-encryption packet with SPI, Sequence number, IV: +IP header: 4500004c 08fe0000 4032f9c9 c0a87b03 c0a87b64 +SPI/Seq #: 00004321 00000008 +IV: 69d08df7 d203329d b093fc49 24e5bd80 +Encrypted Data (32 bytes): +f5199588 1ec4e0c4 488987ce 742e8109 689bb379 d2d750c0 d915dca3 46a89f75 + */ +static unsigned char K9[] = { + 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf +}; +static unsigned char IV9[] = { + 0x69, 0xd0, 0x8d, 0xf7, 0xd2, 0x03, 0x32, 0x9d, 0xb0, 0x93, 0xfc, 0x49, 0x24, 0xe5, 0xbd, 0x80 +}; +static unsigned char P9[] = { + 0x08, 0x00, 0xb5, 0xe8, 0xa8, 0x0a, 0x05, 0x00, 0xa6, 0x9c, 0x08, 0x3d, 0x0b, 0x66, 0x0e, 0x00, + 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x01, 0x02, 0x02, 0x01 +}; +static unsigned char C9[] = { + 0xf5, 0x19, 0x95, 0x88, 0x1e, 0xc4, 0xe0, 0xc4, 0x48, 0x89, 0x87, 0xce, 0x74, 0x2e, 0x81, 0x09, + 0x68, 0x9b, 0xb3, 0x79, 0xd2, 0xd7, 0x50, 0xc0, 0xd9, 0x15, 0xdc, 0xa3, 0x46, 0xa8, 0x9f, 0x75 +}; + +/* +Case #7: Sample tunnel-mode ESP packet (ping 192.168.123.200) +Key: 
01234567 89abcdef 01234567 89abcdef +SPI: 8765 +Source address: 192.168.123.3 +Destination address: 192.168.123.200 +Sequence number: 2 +IV: f4e76524 4f6407ad f13dc138 0f673f37 + +Original packet: +IP header (20 bytes): 45000054 09040000 4001f988 c0a87b03 c0a87bc8 +Data (64 bytes): +08009f76 a90a0100 b49c083d 02a20400 08090a0b 0c0d0e0f 10111213 14151617 +18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637 + +Augment data with: +Padding: 01020304 05060708 090a +Pad length: 0a +Next header: 04 (IP-in-IP) + +Pre-encryption Data with original IP header, padding, pad length and + next header (96 bytes): +45000054 09040000 4001f988 c0a87b03 c0a87bc8 08009f76 a90a0100 b49c083d +02a20400 08090a0b 0c0d0e0f 10111213 14151617 18191a1b 1c1d1e1f 20212223 +24252627 28292a2b 2c2d2e2f 30313233 34353637 01020304 05060708 090a0a04 + + +Post-encryption packet with SPI, Sequence number, IV: +IP header: 4500008c 09050000 4032f91e c0a87b03 c0a87bc8 +SPI/Seq #: 00008765 00000002 +IV: f4e76524 4f6407ad f13dc138 0f673f37 +Encrypted Data (96 bytes): +773b5241 a4c44922 5e4f3ce5 ed611b0c 237ca96c f74a9301 3c1b0ea1 a0cf70f8 +e4ecaec7 8ac53aad 7a0f022b 859243c6 47752e94 a859352b 8a4d4d2d ecd136e5 +c177f132 ad3fbfb2 201ac990 4c74ee0a 109e0ca1 e4dfe9d5 a100b842 f1c22f0d + */ +static unsigned char K10[] = { + 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef +}; +static unsigned char IV10[] = { + 0xf4, 0xe7, 0x65, 0x24, 0x4f, 0x64, 0x07, 0xad, 0xf1, 0x3d, 0xc1, 0x38, 0x0f, 0x67, 0x3f, 0x37 +}; +static unsigned char P10[] = { + 0x45, 0x00, 0x00, 0x54, 0x09, 0x04, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x88, 0xc0, 0xa8, 0x7b, 0x03, + 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0x9f, 0x76, 0xa9, 0x0a, 0x01, 0x00, 0xb4, 0x9c, 0x08, 0x3d, + 0x02, 0xa2, 0x04, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x26, 
0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, + 0x34, 0x35, 0x36, 0x37, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04 + +}; +static unsigned char C10[] = { + 0x77, 0x3b, 0x52, 0x41, 0xa4, 0xc4, 0x49, 0x22, 0x5e, 0x4f, 0x3c, 0xe5, 0xed, 0x61, 0x1b, 0x0c, + 0x23, 0x7c, 0xa9, 0x6c, 0xf7, 0x4a, 0x93, 0x01, 0x3c, 0x1b, 0x0e, 0xa1, 0xa0, 0xcf, 0x70, 0xf8, + 0xe4, 0xec, 0xae, 0xc7, 0x8a, 0xc5, 0x3a, 0xad, 0x7a, 0x0f, 0x02, 0x2b, 0x85, 0x92, 0x43, 0xc6, + 0x47, 0x75, 0x2e, 0x94, 0xa8, 0x59, 0x35, 0x2b, 0x8a, 0x4d, 0x4d, 0x2d, 0xec, 0xd1, 0x36, 0xe5, + 0xc1, 0x77, 0xf1, 0x32, 0xad, 0x3f, 0xbf, 0xb2, 0x20, 0x1a, 0xc9, 0x90, 0x4c, 0x74, 0xee, 0x0a, + 0x10, 0x9e, 0x0c, 0xa1, 0xe4, 0xdf, 0xe9, 0xd5, 0xa1, 0x00, 0xb8, 0x42, 0xf1, 0xc2, 0x2f, 0x0d +}; + +/* +Case #8: Sample tunnel-mode ESP packet + (ping -p ff -s 40 192.168.123.200) +Key: 01234567 89abcdef 01234567 89abcdef +SPI: 8765 +Source address: 192.168.123.3 +Destination address: 192.168.123.200 +Sequence number: 5 +IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22 + +Original packet: +IP header (20 bytes): 45000044 090c0000 4001f990 c0a87b03 c0a87bc8 +Data (48 bytes): +0800d63c aa0a0200 c69c083d a3de0300 ffffffff ffffffff ffffffff ffffffff +ffffffff ffffffff ffffffff ffffffff + +Augment data with: +Padding: 01020304 05060708 090a +Pad length: 0a +Next header: 04 (IP-in-IP) + +Pre-encryption Data with original IP header, padding, pad length and + next header (80 bytes): +45000044 090c0000 4001f990 c0a87b03 c0a87bc8 0800d63c aa0a0200 c69c083d +a3de0300 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff +ffffffff 01020304 05060708 090a0a04 + +Post-encryption packet with SPI, Sequence number, IV: +IP header: 4500007c 090d0000 4032f926 c0a87b03 c0a87bc8 +SPI/Seq #: 00008765 00000005 +IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22 +Encrypted Data (80 bytes): +15b92683 819596a8 047232cc 00f7048f e45318e1 1f8a0f62 ede3c3fc 61203bb5 +0f980a08 c9843fd3 a1b06d5c 07ff9639 b7eb7dfb 
3512e5de 435e7207 ed971ef3
d2726d9b 5ef6affc 6d17a0de cbb13892
 */
static unsigned char K11[] = {
	0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef
};
static unsigned char IV11[] = {
	0x85, 0xd4, 0x72, 0x24, 0xb5, 0xf3, 0xdd, 0x5d, 0x21, 0x01, 0xd4, 0xea, 0x8d, 0xff, 0xab, 0x22
};
static unsigned char P11[] = {
	0x45, 0x00, 0x00, 0x44, 0x09, 0x0c, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x90, 0xc0, 0xa8, 0x7b, 0x03,
	0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0xd6, 0x3c, 0xaa, 0x0a, 0x02, 0x00, 0xc6, 0x9c, 0x08, 0x3d,
	0xa3, 0xde, 0x03, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04
};
static unsigned char C11[] = {
	0x15, 0xb9, 0x26, 0x83, 0x81, 0x95, 0x96, 0xa8, 0x04, 0x72, 0x32, 0xcc, 0x00, 0xf7, 0x04, 0x8f,
	0xe4, 0x53, 0x18, 0xe1, 0x1f, 0x8a, 0x0f, 0x62, 0xed, 0xe3, 0xc3, 0xfc, 0x61, 0x20, 0x3b, 0xb5,
	0x0f, 0x98, 0x0a, 0x08, 0xc9, 0x84, 0x3f, 0xd3, 0xa1, 0xb0, 0x6d, 0x5c, 0x07, 0xff, 0x96, 0x39,
	0xb7, 0xeb, 0x7d, 0xfb, 0x35, 0x12, 0xe5, 0xde, 0x43, 0x5e, 0x72, 0x07, 0xed, 0x97, 0x1e, 0xf3,
	0xd2, 0x72, 0x6d, 0x9b, 0x5e, 0xf6, 0xaf, 0xfc, 0x6d, 0x17, 0xa0, 0xde, 0xcb, 0xb1, 0x38, 0x92
};


// Smaller of two values; both arguments are evaluated more than once
#define min_size(a, b) (((a)<(b))?(a):(b))
// Plain and cypher text will be the same size
// Those vectors using strings for plain text have an extra null terminator that needs
// to be ignored
// (taking the smaller of the two array sizes drops that terminator)
#define vect_size(P, C) (min_size((sizeof(P)),(sizeof(C))))
// Key length is the size of the key array in bytes; only valid on an array, not a pointer
#define CBC_KEY_LEN(kdata) (sizeof(kdata))

// Builds one struct cbc_vector initializer from the K<N>/IV<N>/P<N>/C<N> arrays.
// Field order: {K, K_LEN, IV, P_LEN, P, EXP_C, C, KEYS}; C<N> lands in EXP_C,
// while C and KEYS start out NULL and are filled in by the test code.
#define vector(N) {K##N, (CBC_KEY_LEN(K##N)), IV##N, vect_size(P##N,C##N), P##N, C##N, NULL, NULL, /*NULL, NULL*/}
// Table of all standard vectors defined in this header.
// NOTE(review): this is a non-static definition in a header, so including the
// header from more than one translation unit of the same program would
// presumably produce duplicate symbols - confirm it is only included once per binary.
struct cbc_vector const cbc_vectors[] = {
	vector(1),
	vector(2),
	vector(3),
	vector(4),
	vector(5),
	vector(6),
	vector(7),
	vector(8),
	vector(9),
	vector(10),
	vector(11),
};

#endif /* AES_CBC_STD_VECTORS_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
new file mode 100644
index 000000000..aa9412c35
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
@@ -0,0 +1,443 @@
/**********************************************************************
  Copyright(c) 2011-2016 Intel Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <aes_cbc.h> +#include "types.h" +#include "ossl_helper.h" +#include "cbc_std_vectors.h" + +//define CBC_VECTORS_VERBOSE +//define CBC_VECTORS_EXTRA_VERBOSE + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#ifndef RANDOMS +# define RANDOMS 100 +#endif +#ifndef TEST_LEN +# define TEST_LEN (8*1024*1024) +#endif +#ifndef PAGE_LEN +# define PAGE_LEN (4*1024) +#endif +#ifndef MAX_UNALINED +# define MAX_UNALINED (16) +#endif + +static cbc_key_size const Ksize[] = { CBC_128_BITS, CBC_192_BITS, CBC_256_BITS }; + +typedef void (*aes_cbc_generic)(uint8_t * in, + uint8_t * IV, + uint8_t * keys, uint8_t * out, uint64_t len_bytes); + +int OpenSslEnc(uint8_t k_len, + uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes) +{ + if (CBC_128_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL128 "); +#endif + openssl_aes_128_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out); + } else if (CBC_192_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL192 "); +#endif + openssl_aes_192_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out); + } else if (CBC_256_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL256 "); + fflush(0); +#endif + openssl_aes_256_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out); + } else { + fprintf(stderr, "Invalid key length: %d\n", k_len); + return 1; + } + return 0; +} + +int OpenSslDec(uint8_t k_len, + uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes) +{ + if (CBC_128_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL128 "); +#endif + openssl_aes_128_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out); + } else if (CBC_192_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL192 "); +#endif + openssl_aes_192_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out); 
+ } else if (CBC_256_BITS == k_len) { +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" OpenSSL256 "); +#endif + openssl_aes_256_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out); + } else { + fprintf(stderr, "Invalid key length: %d\n", k_len); + return 1; + } + return 0; +} + +void mk_rand_data(uint8_t * data, uint32_t size) +{ + int i; + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + uint64_t a; + + mismatch = memcmp(test, expected, len); + if (!mismatch) { + return OK; + + } else { + OK = 1; + printf(" failed %s \t\t", data_name); + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + return OK; +} + +int check_vector(struct cbc_vector *vector) +{ + uint8_t *pt_test = NULL; + uint8_t *o_ct_test = NULL; + int OK = 0; + aes_cbc_generic enc; + aes_cbc_generic dec; + +#ifdef CBC_VECTORS_VERBOSE + printf(" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN); +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C, + vector->IV, vector->EXP_C, vector->KEYS); +#endif + fflush(0); +#else + printf("."); +#endif + + if (CBC_128_BITS == vector->K_LEN) { + enc = (aes_cbc_generic) & aes_cbc_enc_128; + dec = (aes_cbc_generic) & aes_cbc_dec_128; +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" CBC128 "); +#endif + } else if (CBC_192_BITS == vector->K_LEN) { + enc = (aes_cbc_generic) & aes_cbc_enc_192; + dec = (aes_cbc_generic) & aes_cbc_dec_192; +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" CBC192 "); +#endif + } else if (CBC_256_BITS == vector->K_LEN) { + enc = (aes_cbc_generic) & aes_cbc_enc_256; + dec = (aes_cbc_generic) & aes_cbc_dec_256; +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" CBC256 "); +#endif + } else { + printf("Invalid key length: %d\n", vector->K_LEN); + return 1; + } + + 
// Allocate space for the calculated ciphertext + pt_test = malloc(vector->P_LEN); + o_ct_test = malloc(vector->P_LEN); + if ((pt_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + + aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS); + +#ifdef CBC_VECTORS_VERBOSE + fflush(0); +#endif + //// + // ISA-l Encrypt + //// + enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN); + if (NULL != vector->EXP_C) { //when the encrypted text is know verify correct + OK |= + check_data(vector->EXP_C, vector->C, vector->P_LEN, + "ISA-L expected cypher text (C)"); + } + OpenSslEnc(vector->K_LEN, vector->K, vector->P, vector->IV, o_ct_test, vector->P_LEN); + OK |= + check_data(vector->C, o_ct_test, vector->P_LEN, + "OpenSSL vs ISA-L cypher text (C)"); + + memcpy(pt_test, vector->P, vector->P_LEN); + memset(vector->P, 0, vector->P_LEN); +#ifdef CBC_VECTORS_VERBOSE + fflush(0); +#endif + + //// + // ISA-l Decrypt + //// + dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN); + OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->P_LEN); + dec(o_ct_test, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN); + OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted OpenSSL (P)"); + memset(vector->P, 0, vector->P_LEN); + OpenSslDec(vector->K_LEN, vector->K, vector->C, vector->IV, vector->P, vector->P_LEN); + OK |= check_data(vector->P, pt_test, vector->P_LEN, "OpenSSL decrypted ISA-L (P)"); +#ifdef CBC_VECTORS_VERBOSE + if (OK) + printf("Failed"); + else + printf("Passed"); + + printf("\n"); +#endif + + return OK; +} + +int test_std_combinations(void) +{ + int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]); + int i, ret; + uint8_t *iv = NULL; + + printf("AES CBC standard test vectors:"); +#ifdef CBC_VECTORS_VERBOSE + printf("\n"); +#endif + ret = posix_memalign((void 
**)&iv, 16, (CBC_IV_DATA_LEN)); + if ((0 != ret) || (NULL == iv)) + return 1; + + for (i = 0; (i < vectors_cnt); i++) { + struct cbc_vector vect = cbc_vectors[i]; + + ret = posix_memalign((void **)&vect.KEYS, 16, (sizeof(*vect.KEYS))); + if ((0 != ret) || (NULL == vect.KEYS)) + return 1; + // IV data must be aligned to 16 byte boundary so move data in aligned buffer and change out the pointer + memcpy(iv, vect.IV, CBC_IV_DATA_LEN); + vect.IV = iv; + vect.C = NULL; + vect.C = malloc(vect.P_LEN); + if ((NULL == vect.C)) + return 1; +#ifdef CBC_VECTORS_VERBOSE + printf("vector[%d of %d] ", i, vectors_cnt); +#endif + if (0 == (i % 25)) + printf("\n"); + if (0 == (i % 10)) + fflush(0); + + if (0 != check_vector(&vect)) + return 1; + + aligned_free(vect.KEYS); + free(vect.C); + } + + aligned_free(iv); + printf("\n"); + return 0; +} + +int test_random_combinations(void) +{ + struct cbc_vector test; + int t, ret; + + printf("AES CBC random test vectors:"); +#ifdef CBC_VECTORS_VERBOSE + fflush(0); +#endif + test.IV = NULL; + ret = posix_memalign((void **)&test.IV, 16, (CBC_IV_DATA_LEN)); + if ((0 != ret) || (NULL == test.IV)) + return 1; + test.KEYS = NULL; + ret = posix_memalign((void **)&test.KEYS, 16, (sizeof(*test.KEYS))); + if ((0 != ret) || (NULL == test.KEYS)) + return 1; + + for (t = 0; RANDOMS > t; t++) { + int Plen = 16 + ((rand() % TEST_LEN) & ~0xf); //must be a 16byte multiple + int offset = (rand() % MAX_UNALINED); + int Kindex = (rand() % (sizeof(Ksize) / sizeof(Ksize[0]))); // select one of the valid key sizes + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + + test.C = NULL; + test.P = NULL; + test.K = NULL; + test.EXP_C = NULL; + test.P_LEN = Plen; + test.K_LEN = Ksize[Kindex]; + + test.P = malloc(test.P_LEN + offset); + test.C = malloc(test.P_LEN + offset); + test.K = malloc(test.K_LEN + offset); + if ((NULL == test.P) || (NULL == test.C) || (NULL == test.K)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return -1; + } + 
test.P += offset; + test.C += offset; + test.K += offset; + + mk_rand_data(test.P, test.P_LEN); + mk_rand_data(test.K, test.K_LEN); + mk_rand_data(test.IV, CBC_IV_DATA_LEN); + +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" Offset:0x%x ", offset); +#endif + if (0 != check_vector(&test)) + return 1; + + test.C -= offset; + free(test.C); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + } + + aligned_free(test.IV); + aligned_free(test.KEYS); + printf("\n"); + return 0; +} + +int test_efence_combinations(void) +{ + struct cbc_vector test; + int offset = 0; + int key_idx; + uint8_t *P = NULL, *C = NULL, *K = NULL, *IV = NULL; + uint8_t *key_data = NULL; + + P = malloc(PAGE_LEN); + C = malloc(PAGE_LEN); + K = malloc(PAGE_LEN); + IV = malloc(PAGE_LEN); + key_data = malloc(PAGE_LEN); + + if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) + || (NULL == key_data) + ) { + printf("malloc of testsize:0x%x failed\n", PAGE_LEN); + return -1; + } + // place buffers to end at page boundary + test.P_LEN = PAGE_LEN / 2; + test.EXP_C = NULL; + + printf("AES CBC efence test vectors:"); + for (key_idx = 0; key_idx < (sizeof(Ksize) / sizeof(Ksize[0])); key_idx++) { + test.K_LEN = Ksize[key_idx]; + + for (offset = 0; MAX_UNALINED > offset; offset++) { + if (0 == (offset % 80)) + printf("\n"); + // move the start and size of the data block towards the end of the page + test.P_LEN = ((PAGE_LEN / (1 + (2 * offset))) & ~0xff); // must be a multiple of 16 + if (16 > test.P_LEN) + test.P_LEN = 16; + //Place data at end of page + test.P = P + PAGE_LEN - test.P_LEN - offset; + test.C = C + PAGE_LEN - test.P_LEN - offset; + test.K = K + PAGE_LEN - test.K_LEN - offset; + test.IV = IV + PAGE_LEN - CBC_IV_DATA_LEN - offset; + test.IV = test.IV - ((uint64_t) test.IV & 0xff); // align to 16 byte boundary + test.KEYS = (struct cbc_key_data *) + (key_data + PAGE_LEN - sizeof(*test.KEYS) - offset); + test.KEYS = (struct cbc_key_data *) + ((uint8_t *) test.KEYS - 
((uint64_t) test.KEYS & 0xff)); // align to 16 byte boundary + + mk_rand_data(test.P, test.P_LEN); + mk_rand_data(test.K, test.K_LEN); + mk_rand_data(test.IV, CBC_IV_DATA_LEN); +#ifdef CBC_VECTORS_EXTRA_VERBOSE + printf(" Offset:0x%x ", offset); +#endif + if (0 != check_vector(&test)) + return 1; + } + + } + + free(P); + free(C); + free(K); + free(IV); + free(key_data); + printf("\n"); + return 0; +} + +int main(void) +{ + uint32_t OK = 0; + + srand(TEST_SEED); + OK |= test_std_combinations(); + OK |= test_random_combinations(); + OK |= test_efence_combinations(); + if (0 == OK) { + printf("...Pass\n"); + } else { + printf("...Fail\n"); + } + return OK; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c new file mode 100644 index 000000000..0558b4254 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c @@ -0,0 +1,183 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +/* + * Run list of standard CBC test vectors through encode and decode checks. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <aes_cbc.h> +#include "types.h" +#include "cbc_std_vectors.h" + +typedef void (*aes_cbc_generic)(uint8_t * in, uint8_t * IV, uint8_t * keys, uint8_t * out, + uint64_t len_bytes); + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + uint64_t a; + + mismatch = memcmp(test, expected, len); + if (!mismatch) { + return OK; + + } else { + OK = 1; + printf(" failed %s \t\t", data_name); + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + return OK; +} + +int check_vector(struct cbc_vector *vector) +{ + uint8_t *pt_test = NULL; + int OK = 0; + aes_cbc_generic enc; + aes_cbc_generic dec; + + DEBUG_PRINT((" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN)); + DEBUG_PRINT((" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C, + vector->IV, vector->EXP_C, 
vector->KEYS)); + printf("."); + + switch (vector->K_LEN) { + case CBC_128_BITS: + enc = (aes_cbc_generic) & aes_cbc_enc_128; + dec = (aes_cbc_generic) & aes_cbc_dec_128; + DEBUG_PRINT((" CBC128 ")); + break; + case CBC_192_BITS: + enc = (aes_cbc_generic) & aes_cbc_enc_192; + dec = (aes_cbc_generic) & aes_cbc_dec_192; + DEBUG_PRINT((" CBC192 ")); + break; + case CBC_256_BITS: + enc = (aes_cbc_generic) & aes_cbc_enc_256; + dec = (aes_cbc_generic) & aes_cbc_dec_256; + DEBUG_PRINT((" CBC256 ")); + break; + default: + printf("Invalid key length: %d\n", vector->K_LEN); + return 1; + } + + // Allocate space for the calculated ciphertext + pt_test = malloc(vector->P_LEN); + + if (pt_test == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + + aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS); + + //// + // ISA-l Encrypt + //// + enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN); + + if (NULL != vector->EXP_C) { //when the encrypted text is known verify correct + OK |= check_data(vector->EXP_C, vector->C, vector->P_LEN, + "ISA-L expected cypher text (C)"); + } + memcpy(pt_test, vector->P, vector->P_LEN); + memset(vector->P, 0, vector->P_LEN); + + //// + // ISA-l Decrypt + //// + dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN); + OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)"); + DEBUG_PRINT((OK ? 
"Failed\n" : "Passed\n")); + + free(pt_test); + return OK; +} + +int test_std_combinations(void) +{ + int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]); + int i, ret; + uint8_t *iv = NULL; + + printf("AES CBC standard test vectors: "); + + ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN)); + if ((0 != ret) || (NULL == iv)) + return 1; + + for (i = 0; (i < vectors_cnt); i++) { + struct cbc_vector vect = cbc_vectors[i]; + + ret = posix_memalign((void **)&(vect.KEYS), 16, sizeof(*vect.KEYS)); + if ((0 != ret) || (NULL == vect.KEYS)) + return 1; + + // IV data must be aligned to 16 byte boundary so move data in + // aligned buffer and change out the pointer + memcpy(iv, vect.IV, CBC_IV_DATA_LEN); + vect.IV = iv; + vect.C = malloc(vect.P_LEN); + if (NULL == vect.C) + return 1; + + DEBUG_PRINT(("vector[%d of %d] ", i, vectors_cnt)); + + if (0 != check_vector(&vect)) + return 1; + + aligned_free(vect.KEYS); + free(vect.C); + } + + aligned_free(iv); + return 0; +} + +int main(void) +{ + uint32_t OK = 0; + + OK = test_std_combinations(); + + printf(0 == OK ? "Pass\n" : "Fail\n"); + return OK; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm b/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm new file mode 100644 index 000000000..2c80401e9 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm @@ -0,0 +1,202 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; NASM macros that zero registers.  The clear_<kind> macros take an explicit
;; register list; the clear_scratch_*_asm macros take no arguments and zero
;; the registers treated as scratch for the build target (LINUX or Windows).

%ifndef _CLEAR_REGS_ASM_
%define _CLEAR_REGS_ASM_

;; Treat elf64 output as LINUX unless the build already defined it
%ifidn __OUTPUT_FORMAT__, elf64
%ifndef LINUX
%define LINUX
%endif
%endif

;
; clear_gps reg [, reg ...]
; Zeroes each general-purpose register given (1 to 16 operands)
;
%macro clear_gps 1-16
%rep %0
	xor	%1, %1
%rotate 1
%endrep
%endmacro

;
; clear_xmms_sse xmm [, xmm ...]
; Zeroes each XMM register given, using the SSE encoding
;
%macro clear_xmms_sse 1-16
%rep %0
	pxor	%1, %1
%rotate 1
%endrep
%endmacro

;
; clear_xmms_avx xmm [, xmm ...]
; Zeroes each XMM register given, using the AVX encoding
;
%macro clear_xmms_avx 1-16
%rep %0
	vpxor	%1, %1, %1
%rotate 1
%endrep
%endmacro

;
; clear_ymms ymm [, ymm ...]
; Zeroes each YMM register given
;
%macro clear_ymms 1-16
%rep %0
	vpxor	%1, %1, %1
%rotate 1
%endrep
%endmacro

;
; clear_zmms zmm [, zmm ...]
; Zeroes each ZMM register given (1 to 32 operands)
;
%macro clear_zmms 1-32
%rep %0
	vpxorq	%1, %1, %1
%rotate 1
%endrep
%endmacro

;
; Zeroes every scratch GP register: rax, rcx, rdx, r8-r11 on both
; targets, plus rdi and rsi when building for Linux
;
%macro clear_scratch_gps_asm 0
%ifdef LINUX
	clear_gps rax, rcx, rdx, r8, r9, r10, r11, rdi, rsi
%else
	clear_gps rax, rcx, rdx, r8, r9, r10, r11
%endif
%endmacro

;
; Zeroes every scratch XMM register with SSE instructions:
; XMM0-XMM15 on Linux, XMM0-XMM5 on Windows
;
%macro clear_scratch_xmms_sse_asm 0
%ifdef LINUX
%define %%NUM_XMMS 16
%else
%define %%NUM_XMMS 6
%endif
%assign i 0
%rep %%NUM_XMMS
	pxor	xmm %+ i, xmm %+ i
%assign i (i+1)
%endrep
%endmacro

;
; Zeroes every scratch XMM register with AVX instructions
;
%macro clear_scratch_xmms_avx_asm 0
%ifndef LINUX
; On Windows, XMM0-XMM5 registers are scratch registers
%assign i 0
%rep 6
	vpxor	xmm %+ i, xmm %+ i, xmm %+ i
%assign i (i+1)
%endrep
%else
	vzeroall
%endif ; LINUX
%endmacro

;
; Zeroes every scratch YMM register
;
; Call it before restoring the XMM registers
; on Windows (XMM6-XMM15)
;
%macro clear_scratch_ymms_asm 0
%ifndef LINUX
; On Windows, YMM0-YMM5 are scratch registers.
; The upper 128 bits of YMM6-YMM15 are scratch as well, but their
; lower 128 bits get restored after this macro has run, and that
; restore clears the upper bits too.
%assign i 0
%rep 6
	vpxor	ymm %+ i, ymm %+ i, ymm %+ i
%assign i (i+1)
%endrep
%else
; On Linux, all YMM registers are scratch registers
	vzeroall
%endif ; LINUX
%endmacro

;
; Zeroes every scratch ZMM register
;
; Call it before restoring the XMM registers
; on Windows (XMM6-XMM15). YMM operands are used
; on purpose: XOR'ing YMM registers is faster
; than XOR'ing ZMM registers, and the operation
; clears the upper 256 bits as well
;
%macro clear_scratch_zmms_asm 0
%ifdef LINUX
; On Linux, all ZMM registers are scratch registers
	vzeroall
	;; vzeroall only clears the first 16 ZMM registers
%else
; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 are scratch registers.
; The upper 384 bits of ZMM6-ZMM15 are scratch as well, but their
; lower 128 bits get restored after this macro has run, and that
; restore clears the upper bits too.
%assign i 0
%rep 6
	vpxorq	ymm %+ i, ymm %+ i, ymm %+ i
%assign i (i+1)
%endrep
%endif ; LINUX

;; ZMM16-ZMM31 are cleared on both targets
%assign i 16
%rep 16
	vpxorq	ymm %+ i, ymm %+ i, ymm %+ i
%assign i (i+1)
%endrep
%endmacro

%endif ;; _CLEAR_REGS_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm new file mode 100644 index 000000000..98304c552 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "gcm_avx_gen2.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm new file mode 100644 index 000000000..5ee5e7b48 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_avx_gen2.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm new file mode 100644 index 000000000..902c17237 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "gcm_avx_gen4.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm new file mode 100644 index 000000000..1e55d24cf --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_avx_gen4.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm new file mode 100644 index 000000000..1717a8662 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "gcm_sse.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm new file mode 100644 index 000000000..d17402bea --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_sse.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm new file mode 100644 index 000000000..71f284789 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm @@ -0,0 +1,32 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +;; single buffer implementation +%include "gcm_vaes_avx512.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm new file mode 100644 index 000000000..c0c587133 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_vaes_avx512.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm new file mode 100644 index 000000000..4b159cefb --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%include "gcm_avx_gen2.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm new file mode 100644 index 000000000..822ef07cc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_avx_gen2.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm new file mode 100644 index 000000000..f6050a8ff --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%include "gcm_avx_gen4.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm new file mode 100644 index 000000000..5959d698f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_avx_gen4.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm new file mode 100644 index 000000000..c583d02b8 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%include "gcm_sse.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm new file mode 100644 index 000000000..5952a6005 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_sse.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm new file mode 100644 index 000000000..bd318fcd1 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm @@ -0,0 +1,32 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +;; single buffer implementation +%include "gcm_vaes_avx512.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm new file mode 100644 index 000000000..da2f611b4 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%define NT_LDST +%define FUNCT_EXTENSION _nt +%include "gcm_vaes_avx512.asm" diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm new file mode 100644 index 000000000..90db18910 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm @@ -0,0 +1,2130 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. 
+; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. 
+; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "reg_sizes.asm" +%include "gcm_defines.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_avx_gen2.asm!" +%endif +%endif +%endif + +%ifndef FUNCT_EXTENSION +%define FUNCT_EXTENSION +%endif + +%ifdef GCM128_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION +%define NROUNDS 9 +%endif + +%ifdef GCM192_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION +%define NROUNDS 11 +%endif + +%ifdef GCM256_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION +%define NROUNDS 13 +%endif + +default rel +; need to push 5 registers into stack to maintain +%define STACK_OFFSET 8*5 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. 
>>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes, in/out: overwritten with the reduced product +%define %%HK %2 ; 16 Bytes, read only +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba + vpshufd %%T2, %%GH, 01001110b + vpshufd %%T3, %%HK, 01001110b + vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0) + vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0) + + vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 + vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + vpxor %%T2, %%T2, %%GH + vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 + + vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs + vpxor %%GH, %%GH, %%T3 + vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK + + ;first phase of the reduction + vpslld %%T2, %%GH, 31 ; packed left shifting << 31 + vpslld %%T3, %%GH, 30 ; packed left shifting << 30 + vpslld %%T4, %%GH, 25 ; packed left shifting << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + + vpsrld %%T2,%%GH,1 ; packed right shifting >> 1 + vpsrld %%T3,%%GH,2 ; packed right shifting >> 2 + vpsrld %%T4,%%GH,7 ; packed right shifting >> 7 + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpxor %%T2, %%T2, %%T5 + vpxor %%GH, %%GH, %%T2 + vpxor %%GH, %%GH, %%T1 ; the result is in %%GH + + +%endmacro + + +%macro PRECOMPUTE 8 
+%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Stores HashKey_2 .. HashKey_8 (successive GHASH_MUL products with %%HK) and the matching HashKey_i_k values into %%GDATA +; HashKey_i_k holds XORed values of the low and high parts of the HashKey_i + vmovdqa %%T5, %%HK + + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly + vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly + vmovdqu [%%GDATA + HashKey_3], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_3_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly + vmovdqu [%%GDATA + HashKey_4], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly + vmovdqu [%%GDATA + HashKey_5], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_5_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly + vmovdqu [%%GDATA + HashKey_6], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_6_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly + vmovdqu [%%GDATA + HashKey_7], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly + vmovdqu [%%GDATA + HashKey_8], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_8_k], 
%%T1 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; Returns 0 if data has length 0. +; Bytes are read backwards from INPUT+LENGTH, so nothing past the end of the input is touched. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; This and the following arguments are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + vpxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH ; one past the last input byte + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist + je %%_done ; flags are still those of the cmp above: exactly 8 bytes, nothing left + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + vpinsrq %%OUTPUT, %%TMP1, 1 ; remaining bytes go into the upper qword + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time while data is left + cmp %%COUNTER, 0 + je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + vpinsrq %%OUTPUT, %%TMP1, 0 ; fewer than 8 bytes: everything goes into the lower qword +%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 14 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%HASH_KEY %4 +%define %%XTMP1 %5 ; xmm temp reg 1 +%define %%XTMP2 %6 +%define %%XTMP3 %7 +%define %%XTMP4 %8 +%define %%XTMP5 %9 ; xmm temp reg 5 +%define %%T1 %10 ; temp reg 1 +%define %%T2 %11 +%define %%T3 %12 +%define %%T4 %13 +%define %%T5 %14 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + vpxor %%AAD_HASH, %%AAD_HASH ; start from an all-zero hash + + cmp %%T2, 16 + jl %%_get_small_AAD_block ; fewer than 16 bytes in total: only the tail block is hashed + +%%_get_AAD_loop16: ; hash one full 16-byte block per iteration + + vmovdqu %%XTMP1, [%%T1] + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + + sub %%T2, 16 ; T2 = AAD bytes still to hash + je %%_CALC_AAD_done + + add %%T1, 16 + cmp %%T2, 16 + jge %%_get_AAD_loop16 + +%%_get_small_AAD_block: ; tail of fewer than 16 bytes, packed by READ_SMALL_DATA_INPUT + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. +; Requires the input data be at least 1 byte long. 
+; Input: +; GDATA_KEY - struct gcm_key_data * +; GDATA_CTX - struct gcm_context_data * +; PLAIN_CYPH_IN - input text +; PLAIN_CYPH_LEN - input text length +; DATA_OFFSET - the current data offset +; ENC_DEC - whether encoding or decoding +; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + mov r13, [%%GDATA_CTX + PBlockLen] ; r13 = PBlockLen saved in the context + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If at least 16 bytes of data, just fill the xmm register. NOTE(review): %%DATA_OFFSET is not added here, unlike the short path below - confirm callers always enter with offset 0 + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + +%%_data_read: ;Finished reading in data + + + vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + cmp r13, rax ; NOTE(review): result unused - the add below overwrites the flags before any branch reads them + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + vmovdqu xmm2, [r12] ; get the appropriate shuffle mask + vpshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + vmovdqa xmm3, xmm1 ; keep a copy of the input block + vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_1 ;Determine if partial block is not being filled and shift mask accordingly + sub r12, 
r15 +%%_no_extra_mask_1: + + vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpand xmm3, xmm1 + vpshufb xmm3, [SHUF_MASK] + vpshufb xmm3, xmm2 + vpxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_dec_done: + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpshufb xmm9, [SHUF_MASK] + vpshufb xmm9, xmm2 + vpxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done +%%_partial_incomplete_2: + add [%%GDATA_CTX+PBlockLen], %%PLAIN_CYPH_LEN +%%_encode_done: + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + + vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + vpshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + vmovq rax, xmm9 + cmp r13, 8 + jle 
%%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + vpsrldq xmm9, xmm9, 8 + vmovq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. +; Updated AAD_HASH is returned in %%T3 + +%macro INITIAL_BLOCKS 24 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %8 +%define %%HASH_KEY %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 + +%assign i (8-%%num_initial_blocks) + vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg + ; start AES for %%num_initial_blocks blocks + vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa reg(i), %%CTR + vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + + vmovdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpxor reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS + vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenc reg(i),%%T_key 
+%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep ; NROUNDS + + +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations +%assign i (i+1) +%endrep + + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) + +%rep %%num_initial_blocks + vpxor reg(j), reg(i) + GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks +%assign i (i+1) +%assign j (j+1) +%endrep + ; %%XMM8 has the current Hash Value + vmovdqa %%T3, %%XMM8 + + cmp %%LENGTH, 128 + jl %%_initial_blocks_done ; no need for precomputed constants + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM1, %%CTR + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM2, %%CTR + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM3, %%CTR + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM4, %%CTR + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM5, %%CTR + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM6, %%CTR + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM7, 
%%CTR + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM8, %%CTR + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + vmovdqu %%T_key, [%%GDATA_KEY+16*0] + vpxor %%XMM1, %%T_key + vpxor %%XMM2, %%T_key + vpxor %%XMM3, %%T_key + vpxor %%XMM4, %%T_key + vpxor %%XMM5, %%T_key + vpxor %%XMM6, %%T_key + vpxor %%XMM7, %%T_key + vpxor %%XMM8, %%T_key + + +%assign i 1 +%rep NROUNDS + vmovdqu %%T_key, [%%GDATA_KEY+16*i] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key +%assign i (i+1) +%endrep + + + vmovdqu %%T_key, [%%GDATA_KEY+16*i] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + vpxor %%XMM1, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM1, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + vpxor %%XMM2, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM2, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + vpxor %%XMM3, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM3, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + vpxor %%XMM4, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM4, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + vpxor %%XMM5, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM5, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 
16*5] + vpxor %%XMM6, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM6, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + vpxor %%XMM7, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM7, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + vpxor %%XMM8, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM8, %%T1 + %endif + + add %%DATA_OFFSET, 128 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; r11 is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 22 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 + + vmovdqa %%T2, %%XMM1 + vmovdqu [rsp + TMP2], 
%%XMM2 + vmovdqu [rsp + TMP3], %%XMM3 + vmovdqu [rsp + TMP4], %%XMM4 + vmovdqu [rsp + TMP5], %%XMM5 + vmovdqu [rsp + TMP6], %%XMM6 + vmovdqu [rsp + TMP7], %%XMM7 + vmovdqu [rsp + TMP8], %%XMM8 + +%ifidn %%loop_idx, in_order + vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT + vpaddd %%XMM2, %%XMM1, [ONE] + vpaddd %%XMM3, %%XMM2, [ONE] + vpaddd %%XMM4, %%XMM3, [ONE] + vpaddd %%XMM5, %%XMM4, [ONE] + vpaddd %%XMM6, %%XMM5, [ONE] + vpaddd %%XMM7, %%XMM6, [ONE] + vpaddd %%XMM8, %%XMM7, [ONE] + vmovdqa %%CTR, %%XMM8 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap +%else + vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT + vpaddd %%XMM2, %%XMM1, [ONEf] + vpaddd %%XMM3, %%XMM2, [ONEf] + vpaddd %%XMM4, %%XMM3, [ONEf] + vpaddd %%XMM5, %%XMM4, [ONEf] + vpaddd %%XMM6, %%XMM5, [ONEf] + vpaddd %%XMM7, %%XMM6, [ONEf] + vpaddd %%XMM8, %%XMM7, [ONEf] + vmovdqa %%CTR, %%XMM8 +%endif + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*0] + vpxor %%XMM1, %%T1 + vpxor %%XMM2, %%T1 + vpxor %%XMM3, %%T1 + vpxor %%XMM4, %%T1 + vpxor %%XMM5, %%T1 + vpxor %%XMM6, %%T1 + vpxor %%XMM7, %%T1 + vpxor %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*1] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [%%GDATA + 16*2] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 
+ vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 + + vpshufd %%T6, %%T2, 01001110b + vpxor %%T6, %%T2 + + vmovdqu %%T5, [%%GDATA + HashKey_8_k] + vpclmulqdq %%T6, %%T6, %%T5, 0x00 ; + + + vmovdqu %%T1, [%%GDATA + 16*3] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP2] + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_7_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*4] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu %%T1, [rsp + TMP3] + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_6_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*5] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [rsp + TMP4] + vmovdqu %%T5, [%%GDATA + HashKey_5] + 
vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_5_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*6] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP5] + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_4_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + + vmovdqu %%T1, [%%GDATA + 16*7] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP6] + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_3_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*8] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP7] + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_2_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + 16*9] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T1, [rsp + TMP8] + vmovdqu %%T5, [%%GDATA + HashKey] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vpxor %%T6, %%T4 + vpxor %%T6, %%T7 + +%ifdef GCM128_MODE + vmovdqu %%T5, [%%GDATA + 16*10] +%endif +%ifdef GCM192_MODE + vmovdqu %%T5, [%%GDATA + 16*10] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] +%endif +%ifdef GCM256_MODE + vmovdqu %%T5, [%%GDATA + 16*10] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*13] + vaesenc %%XMM1, %%T5 + vaesenc 
%%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*14] +%endif + +%assign i 0 +%assign j 1 +%rep 8 + +%ifidn %%ENC_DEC, ENC +%ifdef NT_LD + VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] + vpxor %%T2, %%T2, %%T5 +%else + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] +%endif ; NT_LD + vaesenclast reg(j), reg(j), %%T2 +%else + VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] + vpxor %%T2, %%T2, %%T5 + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3 +%endif ; %%ENC_DEC + +%assign i (i+1) +%assign j (j+1) +%endrep + + vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs + vpxor %%T7, %%T3 + vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7 + + + ;first phase of the reduction + + vpslld %%T2, %%T7, 31 ; packed right shifting << 31 + vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30 + vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + %ifidn %%ENC_DEC, ENC + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer + 
VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer + %endif + + ;second phase of the reduction + + vpsrld %%T2,%%T7,1 ; packed left shifting >> 1 + vpsrld %%T3,%%T7,2 ; packed left shifting >> 2 + vpsrld %%T4,%%T7,7 ; packed left shifting >> 7 + vpxor %%T2, %%T2,%%T3 ; xor the shifted versions + vpxor %%T2, %%T2,%%T4 + + vpxor %%T2, %%T2, %%T1 + vpxor %%T7, %%T7, %%T2 + vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 + + + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] + vpshufb %%XMM3, [SHUF_MASK] + vpshufb %%XMM4, [SHUF_MASK] + vpshufb %%XMM5, [SHUF_MASK] + vpshufb %%XMM6, [SHUF_MASK] + vpshufb %%XMM7, [SHUF_MASK] + vpshufb %%XMM8, [SHUF_MASK] + + + vpxor %%XMM1, %%T6 + +%endmacro + + +; GHASH the last 4 ciphertext blocks. +; %%GDATA is GCM key data +%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + ;; Karatsuba Method + + + vpshufd %%T2, %%XMM1, 01001110b + vpxor %%T2, %%XMM1 + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 + vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 + + vmovdqu %%T3, [%%GDATA + HashKey_8_k] + vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 + + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM2, 01001110b + vpxor %%T2, %%XMM2 + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_7_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM3, 01001110b + vpxor %%T2, %%XMM3 + vmovdqu %%T5, [%%GDATA + 
HashKey_6] + vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_6_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM4, 01001110b + vpxor %%T2, %%XMM4 + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_5_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM5, 01001110b + vpxor %%T2, %%XMM5 + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_4_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM6, 01001110b + vpxor %%T2, %%XMM6 + vmovdqu %%T5, [%%GDATA + HashKey_3] + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_3_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM7, 01001110b + vpxor %%T2, %%XMM7 + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_2_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM8, 01001110b + vpxor %%T2, %%XMM8 + vmovdqu %%T5, [%%GDATA + HashKey] + vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_k] + vpclmulqdq %%T2, %%T2, %%T3, 
0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + vpxor %%XMM1, %%XMM1, %%T6 + vpxor %%T2, %%XMM1, %%T7 + + + + + vpslldq %%T4, %%T2, 8 + vpsrldq %%T2, %%T2, 8 + + vpxor %%T7, %%T4 + vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + ;first phase of the reduction + + vpslld %%T2, %%T7, 31 ; packed right shifting << 31 + vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30 + vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + + vpsrld %%T2,%%T7,1 ; packed left shifting >> 1 + vpsrld %%T3,%%T7,2 ; packed left shifting >> 2 + vpsrld %%T4,%%T7,7 ; packed left shifting >> 7 + vpxor %%T2, %%T2,%%T3 ; xor the shifted versions + vpxor %%T2, %%T2,%%T4 + + vpxor %%T2, %%T2, %%T1 + vpxor %%T7, %%T7, %%T2 + vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 + + +%endmacro + + +; Encryption of a single block +; %%GDATA is GCM key data +%macro ENCRYPT_SINGLE_BLOCK 2 +%define %%GDATA %1 +%define %%XMM0 %2 + + vpxor %%XMM0, [%%GDATA+16*0] +%assign i 1 +%rep NROUNDS + vaesenc %%XMM0, [%%GDATA+16*i] +%assign i (i+1) +%endrep ; NROUNDS + vaesenclast %%XMM0, [%%GDATA+16*i] +%endmacro + + +;; Start of Stack Setup + +%macro FUNC_SAVE 0 + ;; Required for Update/GMC_ENC + ;the number of pushes must equal STACK_OFFSET + push r12 + push r13 + push r14 + push r15 + push rsi + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 + vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 + vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 + vmovdqu 
[rsp + LOCAL_STORAGE + 3*16],xmm9 + vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 + vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 + vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 + vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 + vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 + vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 + + mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5] +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] + vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] + vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16] + vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] + vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] + vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] + vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] + vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] + vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] + vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] +%endif + +;; Required for Update/GMC_ENC + mov rsp, r14 + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. +; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX), +; IV, Additional Authentication data (A_IN), Additional +; Data length (A_LEN) +; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA. 
; Clobbers rax, r10-r13, and xmm0-xmm6
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_INIT        5
; Arguments:
;   %1  GDATA_KEY - key data pointer; HashKey is loaded from it
;   %2  GDATA_CTX - context pointer; every field written below lives here
;   %3  IV        - pointer to the IV; the first 12 bytes are read
;   %4  A_IN      - pointer to the additional authenticated data
;   %5  A_LEN     - length of the additional authenticated data in bytes
;
; Context fields written: AadHash, AadLen, InLen, PBlockLen, PBlockEncKey,
; OrigIV and CurCount.
%define %%GDATA_KEY     %1
%define %%GDATA_CTX     %2
%define %%IV            %3
%define %%A_IN          %4
%define %%A_LEN         %5
%define %%AAD_HASH      xmm0
%define %%SUBHASH       xmm1


        vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey]

        CALC_AAD_HASH   %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax

        ; xmm2/xmm3 were handed to CALC_AAD_HASH as scratch, so their contents
        ; are unspecified here.  XOR xmm2 with itself to obtain a true zero
        ; for the PBlockEncKey store below, so no leftover hash state lands
        ; in the context.
        vpxor   xmm2, xmm2, xmm2
        mov     r10, %%A_LEN

        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH     ; ctx_data.aad hash = aad_hash
        mov     [%%GDATA_CTX + AadLen], r10             ; ctx_data.aad_length = aad_length
        xor     r10, r10
        mov     [%%GDATA_CTX + InLen], r10              ; ctx_data.in_length = 0
        mov     [%%GDATA_CTX + PBlockLen], r10          ; ctx_data.partial_block_length = 0
        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2      ; ctx_data.partial_block_enc_key = 0
        mov     r10, %%IV
        vmovdqa xmm2, [rel ONEf]                        ; read 12 IV bytes and pad with 0x00000001
        vpinsrq xmm2, [r10], 0                          ; IV bytes 0..7  -> low qword
        vpinsrd xmm2, [r10+8], 2                        ; IV bytes 8..11 -> dword 2; dword 3 keeps the ONEf pad
        vmovdqu [%%GDATA_CTX + OrigIV], xmm2            ; ctx_data.orig_IV = iv

        vpshufb xmm2, [SHUF_MASK]                       ; counter is kept in SHUF_MASK-shuffled form

        vmovdqu [%%GDATA_CTX + CurCount], xmm2          ; ctx_data.current_counter = iv
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
; has been initialized by GCM_INIT
; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX),
+; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+;
+; Register roles inside the macro:
+;   r11 (%%DATA_OFFSET) = byte offset into the input/output buffers
+;   r10 = bytes left after PARTIAL_BLOCK (later reused as a scratch pointer)
+;   r13 = bytes left in whole 16-byte blocks, then the length of the tail
+;   r12 = number of initial blocks, later the SHIFT_MASK pointer
+;   r15 = low byte of the counter block, tracked across the by-8 loop
+        cmp     %%PLAIN_CYPH_LEN, 0
+        je      %%_multiple_of_16_bytes         ; nothing to do for an empty update
+
+        xor     %%DATA_OFFSET, %%DATA_OFFSET
+        add     [%%GDATA_CTX+InLen], %%PLAIN_CYPH_LEN   ; Update length of data processed
+        vmovdqu xmm13, [%%GDATA_KEY + HashKey]  ; xmm13 = HashKey
+        vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+        ; finish any block left incomplete by a previous update call
+        PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+        mov     r13, %%PLAIN_CYPH_LEN
+        sub     r13, %%DATA_OFFSET
+        mov     r10, r13                        ; save the amount of data left to process in r10
+        and     r13, -16                        ; r13 = r13 - (r13 mod 16)
+
+        mov     r12, r13
+        shr     r12, 4
+        and     r12, 7                          ; r12 = (number of whole blocks) mod 8
+
+        jz      %%_initial_num_blocks_is_0
+
+        cmp     r12, 7
+        je      %%_initial_num_blocks_is_7
+        cmp     r12, 6
+        je      %%_initial_num_blocks_is_6
+        cmp     r12, 5
+        je      %%_initial_num_blocks_is_5
+        cmp     r12, 4
+        je      %%_initial_num_blocks_is_4
+        cmp     r12, 3
+        je      %%_initial_num_blocks_is_3
+        cmp     r12, 2
+        je      %%_initial_num_blocks_is_2
+
+        jmp     %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*7
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*6
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*5
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*4
+        jmp     %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*3
+        jmp     %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16*2
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+        sub     r13, 16
+        jmp     %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+        INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+        ; r13 is now a multiple of 128 (the initial blocks were subtracted above)
+        cmp     r13, 0
+        je      %%_zero_cipher_left
+
+        ; NOTE(review): skipping straight to GHASH_LAST_8 when exactly 128 bytes
+        ; remain implies INITIAL_BLOCKS already encrypted the next 8 blocks -
+        ; confirm against the INITIAL_BLOCKS definition.
+        sub     r13, 128
+        je      %%_eight_cipher_left
+
+
+        vmovd   r15d, xmm9
+        and     r15d, 255                       ; r15 = lowest byte of the counter dword
+        vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+        ; take the in_order path when adding 8 would overflow the tracked byte
+        cmp     r15d, 255-8
+        jg      %%_encrypt_by_8
+
+
+        add     r15b, 8
+        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+        add     %%DATA_OFFSET, 128
+        sub     r13, 128
+        jne     %%_encrypt_by_8_new
+
+        vpshufb xmm9, [SHUF_MASK]
+        jmp     %%_eight_cipher_left
+
+%%_encrypt_by_8:
+        vpshufb xmm9, [SHUF_MASK]
+        add     r15b, 8
+        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+        vpshufb xmm9, [SHUF_MASK]
+        add     %%DATA_OFFSET, 128
+        sub     r13, 128
+        jne     %%_encrypt_by_8_new
+
+        vpshufb xmm9, [SHUF_MASK]
+
+
+%%_eight_cipher_left:
+        GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+        vmovdqu [%%GDATA_CTX + AadHash], xmm14  ; ctx_data.aad hash = xmm14
+        vmovdqu [%%GDATA_CTX + CurCount], xmm9  ; ctx_data.current_counter = xmm9
+
+        mov     r13, r10
+        and     r13, 15                         ; r13 = (bytes left after PARTIAL_BLOCK) mod 16
+
+        je      %%_multiple_of_16_bytes
+
+        mov     [%%GDATA_CTX + PBlockLen], r13  ; ctx_data.partial_blck_length = r13
+        ; handle the last <16 Byte block separately
+
+        vpaddd  xmm9, [ONE]                     ; INCR CNT to get Yn
+        vmovdqu [%%GDATA_CTX + CurCount], xmm9  ; my_ctx_data.current_counter = xmm9
+        vpshufb xmm9, [SHUF_MASK]
+        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9  ; E(K, Yn)
+        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9      ; ctx_data.partial_block_enc_key = xmm9
+
+        ; with fewer than 16 bytes in total a 16-byte load would read outside
+        ; the caller's buffer, so gather the tail byte by byte instead
+        cmp     %%PLAIN_CYPH_LEN, 16
+        jge     %%_large_enough_update
+
+        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+        READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+        lea     r12, [SHIFT_MASK + 16]
+        sub     r12, r13
+        jmp     %%_data_read
+
+%%_large_enough_update:
+        ; load the 16 bytes that END at the last input byte, then shift the
+        ; r13 wanted bytes down to the bottom of the register
+        sub     %%DATA_OFFSET, 16
+        add     %%DATA_OFFSET, r13
+
+        vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]   ; receive the last <16 Byte block
+
+        sub     %%DATA_OFFSET, r13
+        add     %%DATA_OFFSET, 16
+
+
+        lea     r12, [SHIFT_MASK + 16]
+        sub     r12, r13                        ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+        vmovdqu xmm2, [r12]                     ; get the appropriate shuffle mask
+        vpshufb xmm1, xmm2                      ; shift right 16-r13 bytes
+%%_data_read:
+%ifidn %%ENC_DEC, DEC
+        vmovdqa xmm2, xmm1                      ; keep the ciphertext: it is what gets hashed
+        vpxor   xmm9, xmm1                      ; Ciphertext XOR E(K, Yn)
+        vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+        vpand   xmm9, xmm1                      ; mask out top 16-r13 bytes of xmm9
+        vpand   xmm2, xmm1
+        vpshufb xmm2, [SHUF_MASK]
+        vpxor   xmm14, xmm2
+        vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%else
+        vpxor   xmm9, xmm1                      ; Plaintext XOR E(K, Yn)
+        vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+        vpand   xmm9, xmm1                      ; mask out top 16-r13 bytes of xmm9
+        vpshufb xmm9, [SHUF_MASK]
+        vpxor   xmm14, xmm9
+        vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+        vpshufb xmm9, [SHUF_MASK]               ; shuffle xmm9 back to output as ciphertext
+%endif
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ; output r13 Bytes
+        vmovq   rax, xmm9
+        cmp     r13, 8
+        jle     %%_less_than_8_bytes_left
+
+        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+        add     %%DATA_OFFSET, 8
+        vpsrldq xmm9, xmm9, 8
+        vmovq   rax, xmm9
+        sub     r13, 8
+
+%%_less_than_8_bytes_left:
+        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+        add     %%DATA_OFFSET, 1
+        shr     rax, 8
+        sub     r13, 1
+        jne     %%_less_than_8_bytes_left
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and
+; whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm13, xmm14, xmm15;
+; xmm10 is also handed to GHASH_MUL as a scratch register.
+; The tag is written as 16 bytes when AUTH_TAG_LEN is 16, 12 bytes when it is 12,
+; and 8 bytes for every other value.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%PLAIN_CYPH_LEN rax
+
+        mov     r12, [%%GDATA_CTX + PBlockLen]
+        vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+        cmp     r12, 0
+
+        je      %%_partial_done
+
+        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+        vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+        mov     r12, [%%GDATA_CTX + AadLen]     ; r12 = aadLen (number of bytes)
+        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+        shl     r12, 3                          ; convert into number of bits
+        ; Move all 64 bits: the length block is len(A) || len(C) with 64 bits
+        ; each, and a 32-bit move would truncate len(A) once the AAD reaches
+        ; 2^29 bytes.
+        vmovq   xmm15, r12                      ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3             ; len(C) in bits (*8)
+        vmovq   xmm1, %%PLAIN_CYPH_LEN
+        vpslldq xmm15, xmm15, 8                 ; xmm15 = len(A)|| 0x0000000000000000
+        vpxor   xmm15, xmm1                     ; xmm15 = len(A)||len(C)
+
+        vpxor   xmm14, xmm15
+        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+        vpshufb xmm14, [SHUF_MASK]              ; perform a 16Byte swap
+
+        vmovdqu xmm9, [%%GDATA_CTX + OrigIV]    ; xmm9 = Y0
+
+        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9  ; E(K, Y0)
+
+        vpxor   xmm9, xmm14                     ; tag = E(K, Y0) XOR GHASH
+
+
+%%_return_T:
+        mov     r10, %%AUTH_TAG                 ; r10 = authTag
+        mov     r11, %%AUTH_TAG_LEN             ; r11 = auth_tag_len
+
+        cmp     r11, 16
+        je      %%_T_16
+
+        cmp     r11, 12
+        je      %%_T_12
+
+%%_T_8:
+        vmovq   rax, xmm9
+        mov     [r10], rax
+        jmp     %%_return_T_done
+%%_T_12:
+        vmovq   rax, xmm9
+        mov     [r10], rax
+        vpsrldq xmm9, xmm9, 8
+        vmovd   eax, xmm9
+        mov     [r10 + 8], eax
+        jmp     %%_return_T_done
+
+%%_T_16:
+        vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx_gen2
+; (struct gcm_key_data *key_data);
+;
+; Derives the hash key by encrypting an all-zero block, stores HashKey<<1 mod poly
+; at [key_data + HashKey] and lets PRECOMPUTE fill in the remaining key material.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+        endbranch
+
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+
+        mov     r14, rsp                        ; remember rsp so it can be restored after alignment
+
+
+
+        sub     rsp, VARIABLE_OFFSET
+        and     rsp, ~63                        ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; only xmm6 needs to be maintained
+        vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+        vpxor   xmm6, xmm6
+        ENCRYPT_SINGLE_BLOCK arg1, xmm6         ; xmm6 = HashKey
+
+        vpshufb xmm6, [SHUF_MASK]
+        ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+        vmovdqa xmm2, xmm6
+        vpsllq  xmm6, 1
+        vpsrlq  xmm2, 63
+        vmovdqa xmm1, xmm2
+        vpslldq xmm2, xmm2, 8
+        vpsrldq xmm1, xmm1, 8
+        vpor    xmm6, xmm2
+        ;reduction
+        vpshufd xmm2, xmm1, 00100100b
+        vpcmpeqd xmm2, [TWOONE]
+        vpand   xmm2, [POLY]
+        vpxor   xmm6, xmm2                      ; xmm6 holds the HashKey<<1 mod poly
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        vmovdqu [arg1 + HashKey], xmm6          ; store HashKey<<1 mod poly
+
+
+        PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+        mov     rsp, r14
+
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        ret
+%endif ; _nt
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Thin ABI wrapper around the GCM_INIT macro; not assembled for the _nt variant.
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+        endbranch
+
+        push    r12
+        push    r13
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; xmm6:xmm15 need to be maintained for Windows
+        ; NOTE(review): only xmm6 is preserved here - confirm that GCM_INIT
+        ; touches no other callee-saved xmm register.
+        push    arg5
+        sub     rsp, 1*16
+        vmovdqu [rsp + 0*16],xmm6
+        ; fetch the fifth argument from the caller's stack:
+        ;   1*16 = xmm6 save slot, 8*3 = the three pushes above (r12, r13, arg5),
+        ;   8*5  = return address plus the four 8-byte slots in front of argument 5
+        mov     arg5, [rsp + 1*16 + 8*3 + 8*5]
+%endif
+
+        GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm6 , [rsp + 0*16]
+        add     rsp, 1*16
+        pop     arg5
+%endif
+        pop     r13
+        pop     r12
+        ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;
+; Encrypts plaintext_len bytes via GCM_ENC_DEC; register save/restore is
+; delegated to FUNC_SAVE / FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+        endbranch
+
+        FUNC_SAVE
+
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+        FUNC_RESTORE
+
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;
+; Decrypting counterpart of the update function above (GCM_ENC_DEC with DEC).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+        endbranch
+
+        FUNC_SAVE
+
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+        FUNC_RESTORE
+
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Produces the authentication tag via GCM_COMPLETE; not assembled for the _nt variant.
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+        endbranch
+
+        push    r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; xmm6:xmm15 need to be maintained for Windows.
+        ; Save every callee-saved xmm register GCM_COMPLETE can modify: it loads
+        ; the hash key into xmm13 and hands xmm10 to GHASH_MUL as scratch, in
+        ; addition to xmm6, xmm9, xmm11, xmm14 and xmm15.
+        sub     rsp, 7*16
+        vmovdqu [rsp + 0*16], xmm6
+        vmovdqu [rsp + 1*16], xmm9
+        vmovdqu [rsp + 2*16], xmm10
+        vmovdqu [rsp + 3*16], xmm11
+        vmovdqu [rsp + 4*16], xmm13
+        vmovdqu [rsp + 5*16], xmm14
+        vmovdqu [rsp + 6*16], xmm15
+%endif
+        GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm15, [rsp + 6*16]
+        vmovdqu xmm14, [rsp + 5*16]
+        vmovdqu xmm13, [rsp + 4*16]
+        vmovdqu xmm11, [rsp + 3*16]
+        vmovdqu xmm10, [rsp + 2*16]
+        vmovdqu xmm9, [rsp + 1*16]
+        vmovdqu xmm6, [rsp + 0*16]
+        add     rsp, 7*16
+%endif
+
+        pop     r12
+        ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+        endbranch
+
+        push    r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; xmm6:xmm15 need to be maintained for Windows.
+        ; Same save set as the encrypt finalize: GCM_COMPLETE also writes xmm13
+        ; and passes xmm10 to GHASH_MUL as scratch.
+        sub     rsp, 7*16
+        vmovdqu [rsp + 0*16], xmm6
+        vmovdqu [rsp + 1*16], xmm9
+        vmovdqu [rsp + 2*16], xmm10
+        vmovdqu [rsp + 3*16], xmm11
+        vmovdqu [rsp + 4*16], xmm13
+        vmovdqu [rsp + 5*16], xmm14
+        vmovdqu [rsp + 6*16], xmm15
+%endif
+        GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm15, [rsp + 6*16]
+        vmovdqu xmm14, [rsp + 5*16]
+        vmovdqu xmm13, [rsp + 4*16]
+        vmovdqu xmm11, [rsp + 3*16]
+        vmovdqu xmm10, [rsp + 2*16]
+        vmovdqu xmm9, [rsp + 1*16]
+        vmovdqu xmm6, [rsp + 0*16]
+        add     rsp, 7*16
+%endif
+
+        pop     r12
+        ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; One-shot encryption: GCM_INIT, GCM_ENC_DEC and GCM_COMPLETE back to back.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+        endbranch
+
+        FUNC_SAVE
+
+        GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+        GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
+
+        FUNC_RESTORE
+
+        ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; One-shot decryption: same sequence as the encrypt entry point, with DEC.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+        endbranch
+
+        FUNC_SAVE
+
+        GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+        GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
+
+        FUNC_RESTORE
+
+        ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm
new file mode 100644
index 000000000..4a0b4f82e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm
@@ -0,0 +1,3277 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; The details of the implementation is explained in: +; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012. 
+; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. 
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+; Exactly one key-size mode must be selected by the including file.
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx_gen4.asm!"
+%endif
+%endif
+%endif
+
+; FUNCT_EXTENSION is an optional suffix appended to every exported symbol name.
+%ifndef FUNCT_EXTENSION
+%define FUNCT_EXTENSION
+%endif
+
+;; Decide on AES-GCM key size to compile for
+;; NROUNDS is the number of vaesenc rounds issued before the final vaesenclast.
+;; FN_NAME(x,y) builds the exported symbol aes_gcm_<x>_<bits><y>avx_gen4<ext>.
+%ifdef GCM128_MODE
+%define NROUNDS 9
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION
+%endif
+
+section .text
+default rel
+
+; need to push 5 registers into stack to maintain
+%define STACK_OFFSET 8*5
+
+; Offsets of the stack scratch slots, relative to the aligned rsp.
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+; win64 reserves room for ten 16-byte xmm save slots; other formats reserve none.
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;
+; %%GH is both an input and the result. %%HK is only read. %%T1-%%T3 are
+; overwritten as scratch. %%T4 and %%T5 are accepted but not referenced by this
+; implementation.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        ; four 64x64 carry-less partial products of the two 128-bit operands
+        vpclmulqdq %%T1, %%GH, %%HK, 0x11       ; %%T1 = a1*b1
+        vpclmulqdq %%T2, %%GH, %%HK, 0x00       ; %%T2 = a0*b0
+        vpclmulqdq %%T3, %%GH, %%HK, 0x01       ; %%T3 = a1*b0
+        vpclmulqdq %%GH, %%GH, %%HK, 0x10       ; %%GH = a0*b1
+        vpxor   %%GH, %%GH, %%T3                ; %%GH = a1*b0 + a0*b1 (middle term)
+
+
+        ; split the middle term across the high (%%T1) and low (%%GH) halves
+        vpsrldq %%T3, %%GH, 8                   ; shift-R %%GH 2 DWs
+        vpslldq %%GH, %%GH, 8                   ; shift-L %%GH 2 DWs
+
+        vpxor   %%T1, %%T1, %%T3
+        vpxor   %%GH, %%GH, %%T2                ; <%%T1:%%GH> now holds the 256-bit product
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;first phase of the reduction
+        vmovdqu %%T3, [POLY2]
+
+        vpclmulqdq %%T2, %%T3, %%GH, 0x01
+        vpslldq %%T2, %%T2, 8                   ; shift-L %%T2 2 DWs
+
+        vpxor   %%GH, %%GH, %%T2                ; first phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;second phase of the reduction
+        vpclmulqdq %%T2, %%T3, %%GH, 0x00
+        vpsrldq %%T2, %%T2, 4                   ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+        vpclmulqdq %%GH, %%T3, %%GH, 0x10
+        vpslldq %%GH, %%GH, 4                   ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+        vpxor   %%GH, %%GH, %%T2                ; second phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        vpxor   %%GH, %%GH, %%T1                ; the result is in %%GH
+
+%endmacro
+
+
+; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4
+; functions, but are kept to allow users to switch cpu architectures between calls
+; of pre, init, update, and finalize.
+%macro PRECOMPUTE 8
+; PRECOMPUTE: fills the key structure at %%GDATA with successive products of the
+; hash key.
+;   %1 (%%GDATA)    pointer to the key data that receives the results
+;   %2 (%%HK)       hash key, HashKey<<1 mod poly; read only
+;   %3-%8           xmm scratch registers, all overwritten
+; Stores HashKey_k, then HashKey_i and HashKey_i_k for i = 2..8. Every *_k entry
+; holds the XOR of the low and high 64-bit halves of its key.
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+        ; %%T5 is the running product; it starts out as the hash key itself
+        vmovdqa %%T5, %%HK
+
+        vpshufd %%T1, %%T5, 01001110b           ; swap the two 64-bit halves
+        vpxor   %%T1, %%T5                      ; low half XOR high half
+        vmovdqu [%%GDATA + HashKey_k], %%T1
+
+        ; one multiplication by the hash key per remaining table entry
+%assign i 2
+%rep 7
+        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^i<<1 mod poly
+        vmovdqu [%%GDATA + HashKey_ %+ i], %%T5 ; [HashKey_i] = HashKey^i<<1 mod poly
+        vpshufd %%T1, %%T5, 01001110b
+        vpxor   %%T1, %%T5
+        vmovdqu [%%GDATA + HashKey_ %+ i %+ _k], %%T1
+%assign i (i+1)
+%endrep
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; LENGTH must be at least 1: the zero-length check in %%_byte_loop_2 is disabled,
+; so a zero LENGTH would load the byte located just before INPUT.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+; %%END_READ_LOCATION, %%COUNTER and %%TMP1 are overwritten; %%INPUT and %%LENGTH
+; are only read.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+        vpxor   %%OUTPUT, %%OUTPUT
+        mov     %%COUNTER, %%LENGTH
+        mov     %%END_READ_LOCATION, %%INPUT
+        add     %%END_READ_LOCATION, %%LENGTH   ; one past the last input byte
+        xor     %%TMP1, %%TMP1
+
+
+        cmp     %%COUNTER, 8
+        jl      %%_byte_loop_2
+        vpinsrq %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exists
+        je      %%_done                         ; flags are still those of the cmp above: exactly 8 bytes
+
+        sub     %%COUNTER, 8
+
+        ; Bytes are gathered from the end of the data backwards, so the first
+        ; remaining byte ends up in the lowest byte of %%TMP1.
+%%_byte_loop_1:                                 ;Read in data 1 byte at a time while data is left
+        shl     %%TMP1, 8                       ;This loop handles when 8 bytes were already read in
+        dec     %%END_READ_LOCATION
+        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+        dec     %%COUNTER
+        jg      %%_byte_loop_1
+        vpinsrq %%OUTPUT, %%TMP1, 1
+        jmp     %%_done
+
+%%_byte_loop_2:                                 ;Read in data 1 byte at a time while data is left
+        ;; NOTE: in current implementation check for zero length is obsolete here.
+        ;; The adequate checks are done by callers of this macro.
+        ;; cmp %%COUNTER, 0
+        ;; je %%_done
+        shl     %%TMP1, 8                       ;This loop handles when no bytes were already read in
+        dec     %%END_READ_LOCATION
+        mov     BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+        dec     %%COUNTER
+        jg      %%_byte_loop_2
+        vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+; %1 (%%A_IN) and %2 (%%A_LEN) are only read. %3 (%%AAD_HASH) receives the result
+; and is all zero when A_LEN is 0. %4 (%%HASH_KEY) is only read. %5-%9 are xmm
+; scratch registers and %10-%14 general-purpose scratch registers.
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 1
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+        mov     %%T1, %%A_IN                    ; T1 = AAD
+        mov     %%T2, %%A_LEN                   ; T2 = aadLen
+        vpxor   %%AAD_HASH, %%AAD_HASH
+
+        ; READ_SMALL_DATA_INPUT leaves the zero-length check to its callers and
+        ; would otherwise load the byte in front of the buffer, so an empty AAD
+        ; must stop here with the hash still zero.
+        test    %%T2, %%T2
+        jz      %%_CALC_AAD_done
+
+        cmp     %%T2, 16
+        jl      %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+        ; fold one full 16-byte block into the running hash
+
+        vmovdqu %%XTMP1, [%%T1]
+        ;byte-reflect the AAD data
+        vpshufb %%XTMP1, [SHUF_MASK]
+        vpxor   %%AAD_HASH, %%XTMP1
+        GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+        sub     %%T2, 16
+        je      %%_CALC_AAD_done                ; length was an exact multiple of 16
+
+        add     %%T1, 16
+        cmp     %%T2, 16
+        jge     %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+        ; 1..15 bytes remain here; they are zero-padded into one final block
+        READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+        ;byte-reflect the AAD data
+        vpshufb %%XTMP1, [SHUF_MASK]
+        vpxor   %%AAD_HASH, %%XTMP1
+        GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+; DATA_OFFSET is advanced by the number of bytes written; AAD_HASH is updated in
+; place and stored to the context. Nothing happens when the context holds no
+; partial block (PBlockLen == 0).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+
+        mov     r13, [%%GDATA_CTX + PBlockLen]  ; r13 = bytes already held in the partial block
+        cmp     r13, 0
+        je      %%_partial_block_done           ;Leave Macro if no partial blocks
+
+        cmp     %%PLAIN_CYPH_LEN, 16            ;Read in input data without over reading
+        jl      %%_fewer_than_16_bytes
+        VXLDR   xmm1, [%%PLAIN_CYPH_IN]         ;If at least 16 bytes of data, just fill the xmm register
+        jmp     %%_data_read
+
+%%_fewer_than_16_bytes:
+        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+        READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read:                                   ;Finished reading in data
+
+
+        vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey]      ;xmm9 = my_ctx_data.partial_block_enc_key
+        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+        lea     r12, [SHIFT_MASK]
+
+        add     r12, r13                        ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+        vmovdqu xmm2, [r12]                     ; get the appropriate shuffle mask
+        vpshufb xmm9, xmm2                      ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+        vmovdqa xmm3, xmm1                      ; keep the ciphertext: it is what gets hashed
+        vpxor   xmm9, xmm1                      ; Cyphertext XOR E(K, Yn)
+
+        mov     r15, %%PLAIN_CYPH_LEN
+        add     r15, r13
+        sub     r15, 16                         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_1              ;Determine if partial block is not being filled and shift mask accordingly
+        sub     r12, r15                        ; r15 is negative here, so this moves the mask pointer forward
+%%_no_extra_mask_1:
+
+        vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
+        vpand   xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9
+
+        vpand   xmm3, xmm1
+        vpshufb xmm3, [SHUF_MASK]
+        vpshufb xmm3, xmm2
+        vpxor   %%AAD_HASH, xmm3
+
+
+        cmp     r15,0
+        jl      %%_partial_incomplete_1
+
+        ; the block is now complete: fold it into the hash and clear PBlockLen
+        GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+        xor     rax,rax
+        mov     [%%GDATA_CTX + PBlockLen], rax
+        jmp     %%_dec_done
+%%_partial_incomplete_1:
+        ; still short of 16 bytes: just record how much more has been absorbed
+        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+        vpxor   xmm9, xmm1                      ; Plaintext XOR E(K, Yn)
+
+        mov     r15, %%PLAIN_CYPH_LEN
+        add     r15, r13
+        sub     r15, 16                         ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_2              ;Determine if partial block is not being filled and shift mask accordingly
+        sub     r12, r15                        ; r15 is negative here, so this moves the mask pointer forward
+%%_no_extra_mask_2:
+
+        vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+        vpand   xmm9, xmm1                      ; mask out bottom r13 bytes of xmm9
+
+        vpshufb xmm9, [SHUF_MASK]
+        vpshufb xmm9, xmm2
+        vpxor   %%AAD_HASH, xmm9
+
+        cmp     r15,0
+        jl      %%_partial_incomplete_2
+
+        ; the block is now complete: fold it into the hash and clear PBlockLen
+        GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+        xor     rax,rax
+        mov     [%%GDATA_CTX + PBlockLen], rax
+        jmp     %%_encode_done
+%%_partial_incomplete_2:
+        ; still short of 16 bytes: just record how much more has been absorbed
+        add     [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+        vpshufb xmm9, [SHUF_MASK]               ; shuffle xmm9 back to output as ciphertext
+        vpshufb xmm9, xmm2
+%endif
+
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ; output encrypted Bytes
+        cmp     r15,0
+        jl      %%_partial_fill
+        mov     r12, r13
+        mov     r13, 16
+        sub     r13, r12                        ; Set r13 to be the number of bytes to write out
+        jmp     %%_count_set
+%%_partial_fill:
+        mov     r13, %%PLAIN_CYPH_LEN           ; block not completed: every input byte is written out
+%%_count_set:
+        vmovq   rax, xmm9
+        cmp     r13, 8
+        jle     %%_less_than_8_bytes_left
+
+        mov     [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+        add     %%DATA_OFFSET, 8
+        vpsrldq xmm9, xmm9, 8
+        vmovq   rax, xmm9
+        sub     r13, 8
+%%_less_than_8_bytes_left:
+        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+        add     %%DATA_OFFSET, 1
+        shr     rax, 8
+        sub     r13, 1
+        jne     %%_less_than_8_bytes_left
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; GHASH_SINGLE_MUL: multiplies %%CIPHER by the key stored at [%%GDATA + %%HASHKEY]
+; without reducing the result. When %%FIRST is the literal token "first" the three
+; partial products initialise %%STATE_11 / %%STATE_00 / %%STATE_MID; for any other
+; value they are XOR-accumulated into those registers. %%T1 and %%T2 are scratch.
+%macro GHASH_SINGLE_MUL 9
+%define %%GDATA %1
+%define %%HASHKEY %2
+%define %%CIPHER %3
+%define %%STATE_11 %4
+%define %%STATE_00 %5
+%define %%STATE_MID %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%FIRST %9
+
+        vmovdqu %%T1, [%%GDATA + %%HASHKEY]
+%ifidn %%FIRST, first
+        vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11     ; %%STATE_11 = a1*b1
+        vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00     ; %%STATE_00 = a0*b0
+        vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01    ; %%STATE_MID = a1*b0
+        vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10           ; %%T2 = a0*b1
+        vpxor   %%STATE_MID, %%STATE_MID, %%T2
+%else
+        vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
+        vpxor   %%STATE_11, %%STATE_11, %%T2
+
+        vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
+        vpxor   %%STATE_00, %%STATE_00, %%T2
+
+        vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
+        vpxor   %%STATE_MID, %%STATE_MID, %%T2
+
+        vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
+        vpxor   %%STATE_MID, %%STATE_MID, %%T2
+%endif
+
+%endmacro
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3 +; The incoming AAD hash is read from %%XMM8 on entry. +; On exit %%XMM1-%%XMM8 hold the 8 blocks that follow the initial blocks, byte-swapped with SHUF_MASK for GHASH, +; and the hash in %%T3 has already been XORed into %%XMM1. If the write of the 8th block is skipped, +; %%XMM8 holds only the byte-swapped encrypted counter block. + +%macro INITIAL_BLOCKS 23 +%define %%GDATA_KEY %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%LENGTH %4 +%define %%DATA_OFFSET %5 +%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %7 +%define %%T2 %8 +%define %%T3 %9 +%define %%T4 %10 +%define %%T5 %11 +%define %%CTR %12 +%define %%XMM1 %13 +%define %%XMM2 %14 +%define %%XMM3 %15 +%define %%XMM4 %16 +%define %%XMM5 %17 +%define %%XMM6 %18 +%define %%XMM7 %19 +%define %%XMM8 %20 +%define %%T6 %21 +%define %%T_key %22 +%define %%ENC_DEC %23 + +%assign i (8-%%num_initial_blocks) + ;; Move AAD_HASH to temp reg + vmovdqu %%T2, %%XMM8 + ;; Start AES for %%num_initial_blocks blocks + ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0 + vmovdqa reg(i), %%CTR + vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +%if(%%num_initial_blocks>0) +vmovdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpxor reg(i),reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%endif ; %if(%%num_initial_blocks>0) + + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), reg(i), %%T1 + ;; Write back ciphertext for %%num_initial_blocks blocks + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + ;; Prepare ciphertext for GHASH computations + vpshufb reg(i),
[SHUF_MASK] +%assign i (i+1) +%endrep + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%assign i (9-%%num_initial_blocks) +%if(%%num_initial_blocks>0) + vmovdqa %%T3, reg(i) +%assign i (i+1) + +%rep %%num_initial_blocks-1 + vmovdqu [rsp + TMP %+ i], reg(i) +%assign i (i+1) +%endrep +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Build the counter blocks for the next 8 blocks in %%XMM1-%%XMM8 + ;; and leave the last one (%%XMM8) in %%CTR + vpaddd %%XMM1, %%CTR, [ONE] ; INCR Y0 + vpaddd %%XMM2, %%CTR, [TWO] ; INCR Y0 + vpaddd %%XMM3, %%XMM1, [TWO] ; INCR Y0 + vpaddd %%XMM4, %%XMM2, [TWO] ; INCR Y0 + vpaddd %%XMM5, %%XMM3, [TWO] ; INCR Y0 + vpaddd %%XMM6, %%XMM4, [TWO] ; INCR Y0 + vpaddd %%XMM7, %%XMM5, [TWO] ; INCR Y0 + vpaddd %%XMM8, %%XMM6, [TWO] ; INCR Y0 + vmovdqa %%CTR, %%XMM8 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + vmovdqu %%T_key, [%%GDATA_KEY+16*0] + vpxor %%XMM1, %%XMM1, %%T_key + vpxor %%XMM2, %%XMM2, %%T_key + vpxor %%XMM3, %%XMM3, %%T_key + vpxor %%XMM4, %%XMM4, %%T_key + vpxor %%XMM5, %%XMM5, %%T_key + vpxor %%XMM6, %%XMM6, %%T_key + vpxor %%XMM7, %%XMM7, %%T_key + vpxor %%XMM8, %%XMM8, %%T_key + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks) + +%define %%T4_2 %%T4 +%if(%%num_initial_blocks>0) + ;; Hash in AES state + ;; T2 - incoming AAD hash + vpxor %%T2, %%T3 + + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*1] +
vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*2] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>1) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*3] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*4] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>2) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>3) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*5] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key +
vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*6] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>4) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*7] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*8] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>5) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*9] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%ifndef GCM128_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*10] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1)
+%if(%%num_initial_blocks>6) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + +%ifdef GCM128_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*10] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key +%endif + +%ifdef GCM192_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*11] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*12] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key +%endif +%ifdef GCM256_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*11] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*12] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>7) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + +%ifdef GCM256_MODE ; GCM256 + vmovdqu %%T_key, [%%GDATA_KEY+16*13] +
vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*14] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key +%endif ; GCM256 mode + +%if(%%num_initial_blocks>0) + vpsrldq %%T3, %%T6, 8 ; %%T3 = %%T6 shifted right 2 DWs + vpslldq %%T6, %%T6, 8 ; shift-L %%T6 2 DWs + vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 + vpxor %%T4, %%T6, %%T4 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; First phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T4, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs + + ;; First phase of the reduction complete + vpxor %%T4, %%T4, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Second phase of the reduction + vpclmulqdq %%T2, %%T3, %%T4, 0x00 + ;; Shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + vpsrldq %%T2, %%T2, 4 + + vpclmulqdq %%T4, %%T3, %%T4, 0x10 + ;; Shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) + vpslldq %%T4, %%T4, 4 + ;; Second phase of the reduction complete + vpxor %%T4, %%T4, %%T2 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; The result is in %%T3 + vpxor %%T3, %%T1, %%T4 +%else + ;; The hash should end up in T3 + vmovdqa %%T3, %%T2 +%endif + + ;; Final hash is now in T3 +%if %%num_initial_blocks > 0 + ;; NOTE: obsolete in case %%num_initial_blocks = 0 + sub %%LENGTH, 16*%%num_initial_blocks +%endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + vpxor %%XMM1, %%XMM1, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM1, %%T1 + %endif
+ + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + vpxor %%XMM2, %%XMM2, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM2, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + vpxor %%XMM3, %%XMM3, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM3, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + vpxor %%XMM4, %%XMM4, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM4, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + vpxor %%XMM5, %%XMM5, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM5, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + vpxor %%XMM6, %%XMM6, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM6, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + vpxor %%XMM7, %%XMM7, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM7, %%T1 + %endif + +%if %%num_initial_blocks > 0 + ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0 + ;; This macro is executed for length 128 and up, + ;; zero length is checked in GCM_ENC_DEC. + ;; If the last block is partial then the xor will be done later + ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
+ ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128 + cmp %%LENGTH, 128 + jl %%_initial_skip_last_word_write +%endif + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + vpxor %%XMM8, %%XMM8, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM8, %%T1 + %endif + + ;; Update %%LENGTH with the number of blocks processed + sub %%LENGTH, 16 + add %%DATA_OFFSET, 16 +%%_initial_skip_last_word_write: + sub %%LENGTH, 128-16 + add %%DATA_OFFSET, 128-16 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + ;; Combine GHASHed value with the corresponding ciphertext + vpxor %%XMM1, %%XMM1, %%T3 + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + +;;; INITIAL_BLOCKS macro with support for a partial final block. +;;; num_initial_blocks is expected to include the partial final block +;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 25 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0); the %if checks below also cover 8 +%define %%T1 %8 +%define %%T2 %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 +%define %%INSTANCE_TYPE %25 + +%assign i (8-%%num_initial_blocks) + ;; Move AAD_HASH to temp reg + vmovdqu %%T2, %%XMM8 + ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + ;; Compute AES counters + vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 + vmovdqa reg(i), %%CTR + vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +vmovdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + ; Start AES for %%num_initial_blocks blocks + vpxor reg(i),reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Hash all but the last block of data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%assign i (9-%%num_initial_blocks) +%if %%num_initial_blocks > 0 +%rep %%num_initial_blocks-1 + ;; Encrypt the message for all but the last block + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), reg(i), %%T1 + ;; write back
ciphertext for %%num_initial_blocks blocks + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + ;; Prepare ciphertext for GHASH computations + vpshufb reg(i), [rel SHUF_MASK] +%assign i (i+1) +%endrep +%endif + ;; The final block of data may be <16B + sub %%LENGTH, 16*(%%num_initial_blocks-1) + +%if %%num_initial_blocks < 8 + ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8. + ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128. + cmp %%LENGTH, 16 + jl %%_small_initial_partial_block + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handle a full length final block - encrypt and hash all blocks +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + sub %%LENGTH, 16 + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + + ;; Encrypt the message + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), reg(i), %%T1 + ;; write back ciphertext for %%num_initial_blocks blocks + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + ;; Prepare ciphertext for GHASH computations + vpshufb reg(i), [rel SHUF_MASK] + + ;; Hash all of the data +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks) +%assign last_block_to_hash 0 + +%if(%%num_initial_blocks>last_block_to_hash) + ;; Hash in AES state + vpxor %%T2, reg(j) + + ;; T2 - incoming AAD hash + ;; reg(i) holds ciphertext + ;; T5 - hash key + ;; T6 - updated xor + ;; reg(1)/xmm1 should now be available for tmp use + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 + vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 + vpxor %%T6, %%T6, %%T5 +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%assign
rep_count (%%num_initial_blocks-1) +%if rep_count > 0 +%rep rep_count + + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T3, reg(j), %%T5, 0x11 + vpxor %%T1, %%T1, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x00 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%endrep +%endif + ;; Record that a reduction is needed + mov r12, 1 + + jmp %%_small_initial_compute_hash + + +%endif ; %if %%num_initial_blocks < 8 + +%%_small_initial_partial_block: + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handle ghash for a <16B final block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;; In this case if it's a single call to encrypt we can + ;; hash all of the data but if it's an init / update / finalize + ;; series of call we need to leave the last block if it's + ;; less than a full block of data. + + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i) + ;; Handle a partial final block + ;; GDATA, KEY, T1, T2 + ;; r13 - length + ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long + ;; NOTE: could be replaced with %%LENGTH but at this point + ;; %%LENGTH is always less than 16. + ;; No PLAIN_CYPH_LEN argument available in this macro.
+ ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET + vpshufb reg(i), [SHUF_MASK] + +%ifidn %%INSTANCE_TYPE, multi_call +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks-1) +%assign last_block_to_hash 1 +%else +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks) +%assign last_block_to_hash 0 +%endif + +%if(%%num_initial_blocks>last_block_to_hash) + ;; Record that a reduction is needed + mov r12, 1 + ;; Hash in AES state + vpxor %%T2, reg(j) + + ;; T2 - incoming AAD hash + ;; reg(i) holds ciphertext + ;; T5 - hash key + ;; T6 - updated xor + ;; reg(1)/xmm1 should now be available for tmp use + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 + vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 + vpxor %%T6, %%T6, %%T5 +%else + ;; Record that a reduction is not needed - + ;; In this case no hashes are computed because there + ;; is only one initial block and it is < 16B in length.
+ mov r12, 0 +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%ifidn %%INSTANCE_TYPE, multi_call +%assign rep_count (%%num_initial_blocks-2) +%%_multi_call_hash: +%else +%assign rep_count (%%num_initial_blocks-1) +%endif +%if rep_count > 0 +%rep rep_count + + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T3, reg(j), %%T5, 0x11 + vpxor %%T1, %%T1, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x00 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%endrep +%endif + +%%_small_initial_compute_hash: + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Ghash reduction +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%if(%%num_initial_blocks=1) +%ifidn %%INSTANCE_TYPE, multi_call + ;; We only need to check if a reduction is needed if + ;; initial_blocks == 1 and init/update/final is being used. + ;; In this case we may just have a partial block, and that + ;; gets hashed in finalize.
+ cmp r12, 0 + je %%_no_reduction_needed +%endif +%endif + + vpsrldq %%T3, %%T6, 8 ; %%T3 = %%T6 shifted right 2 DWs + vpslldq %%T6, %%T6, 8 ; shift-L %%T6 2 DWs + vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 + vpxor %%T4, %%T6, %%T4 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; First phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T4, 0x01 + ;; shift-L %%T2 2 DWs + vpslldq %%T2, %%T2, 8 + vpxor %%T4, %%T4, %%T2 + + ;; First phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Second phase of the reduction + + vpclmulqdq %%T2, %%T3, %%T4, 0x00 + ;; Shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + vpsrldq %%T2, %%T2, 4 + + vpclmulqdq %%T4, %%T3, %%T4, 0x10 + ;; Shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) + vpslldq %%T4, %%T4, 4 + + vpxor %%T4, %%T4, %%T2 + ;; Second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%T3, %%T1, %%T4 + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If using init/update/finalize, we need to xor any partial block data + ;; into the hash. +%if %%num_initial_blocks > 1 + ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place +%if %%num_initial_blocks != 8 + ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero + cmp qword [%%GDATA_CTX + PBlockLen], 0 + je %%_no_partial_block_xor +%endif ; %%num_initial_blocks != 8 + vpxor %%T3, %%T3, reg(8) +%%_no_partial_block_xor: +%endif ; %%num_initial_blocks > 1 +%endif ; %%INSTANCE_TYPE, multi_call + +%if(%%num_initial_blocks=1) +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: %%_no_reduction_needed case only valid for + ;; multi_call with initial_blocks = 1. + ;; Look for comment above around '_no_reduction_needed' + ;; The jmp below is required: falling through into %%_no_reduction_needed + ;; would overwrite the reduced hash in %%T3 with %%T2 xor reg(8).
+ + ;; The result is in %%T3 + jmp %%_after_reduction + +%%_no_reduction_needed: + ;; The hash should end up in T3. The only way we should get here is if + ;; there is a partial block of data, so xor that into the hash. + vpxor %%T3, %%T2, reg(8) +%endif ; %%INSTANCE_TYPE = multi_call +%endif ; %%num_initial_blocks=1 + +%%_after_reduction: + ;; Final hash is now in T3 + +%endmacro ; INITIAL_BLOCKS_PARTIAL + + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; %%DATA_OFFSET is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 23 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 +%define %%FULL_PARTIAL %23 + + vmovdqa %%T2, %%XMM1 + vmovdqu [rsp + TMP2], %%XMM2 + vmovdqu [rsp + TMP3], %%XMM3 + vmovdqu [rsp + TMP4], %%XMM4 + vmovdqu [rsp + TMP5], %%XMM5 + vmovdqu [rsp + TMP6], %%XMM6 + vmovdqu [rsp + TMP7], %%XMM7 + vmovdqu [rsp + TMP8], %%XMM8 + +%ifidn %%loop_idx, in_order + vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT + vmovdqu %%T5, [TWO] + vpaddd %%XMM2, %%CTR, %%T5 + vpaddd %%XMM3, %%XMM1, %%T5 + vpaddd %%XMM4, %%XMM2, %%T5 + vpaddd %%XMM5, %%XMM3, %%T5 + vpaddd %%XMM6, %%XMM4, %%T5 + vpaddd %%XMM7, %%XMM5, %%T5 + vpaddd %%XMM8, %%XMM6, %%T5 + vmovdqa %%CTR, %%XMM8 + + vmovdqu %%T5, [SHUF_MASK] + vpshufb %%XMM1, %%T5 ; perform a 16Byte swap + vpshufb %%XMM2, %%T5 ; perform a 16Byte swap + vpshufb %%XMM3, %%T5 ; perform a 16Byte swap + vpshufb %%XMM4, %%T5 ; perform a 16Byte swap + vpshufb %%XMM5, %%T5 ; perform a 16Byte swap + vpshufb
%%XMM6, %%T5 ; perform a 16Byte swap + vpshufb %%XMM7, %%T5 ; perform a 16Byte swap + vpshufb %%XMM8, %%T5 ; perform a 16Byte swap +%else + vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT + vmovdqu %%T5, [TWOf] + vpaddd %%XMM2, %%CTR, %%T5 + vpaddd %%XMM3, %%XMM1, %%T5 + vpaddd %%XMM4, %%XMM2, %%T5 + vpaddd %%XMM5, %%XMM3, %%T5 + vpaddd %%XMM6, %%XMM4, %%T5 + vpaddd %%XMM7, %%XMM5, %%T5 + vpaddd %%XMM8, %%XMM6, %%T5 + vmovdqa %%CTR, %%XMM8 +%endif + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*0] + vpxor %%XMM1, %%XMM1, %%T1 + vpxor %%XMM2, %%XMM2, %%T1 + vpxor %%XMM3, %%XMM3, %%T1 + vpxor %%XMM4, %%XMM4, %%T1 + vpxor %%XMM5, %%XMM5, %%T1 + vpxor %%XMM6, %%XMM6, %%T1 + vpxor %%XMM7, %%XMM7, %%T1 + vpxor %%XMM8, %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*1] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [%%GDATA + 16*2] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 + vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 + vpxor %%T6, %%T6, %%T5 + + vmovdqu %%T1, [%%GDATA + 16*3] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP2] + vmovdqu 
%%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*4] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu %%T1, [rsp + TMP3] + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*5] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [rsp + TMP4] + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*6] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP5] + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, 
[%%GDATA + 16*7] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP6] + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*8] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP7] + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + 16*9] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T1, [rsp + TMP8] + vmovdqu %%T5, [%%GDATA + HashKey] + + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T1, %%T4, %%T3 + + + vmovdqu %%T5, [%%GDATA + 16*10] + %ifndef GCM128_MODE ; GCM192 or GCM256 + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc 
%%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] +%endif +%ifdef GCM256_MODE + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*13] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*14] +%endif ; GCM256 + +%assign i 0 +%assign j 1 +%rep 8 + + ;; SNP TBD: This is pretty ugly - consider whether just XORing the + ;; data in after vaesenclast is simpler and performant. Would + ;; also have to ripple it through partial block and ghash_mul_8. +%ifidn %%FULL_PARTIAL, full + %ifdef NT_LD + VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + vpxor %%T2, %%T2, %%T5 + %else + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + %endif + + %ifidn %%ENC_DEC, ENC + vaesenclast reg(j), reg(j), %%T2 + %else + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 + %endif + +%else + ; Don't read the final data during partial block processing + %ifdef NT_LD + %if (i<7) + VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + vpxor %%T2, %%T2, %%T5 + %else + ;; Stage the key directly in T2 rather than hash it with plaintext + vmovdqu %%T2, %%T5 + %endif + %else + %if (i<7) + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + %else + ;; Stage the key directly in T2 rather than hash it with plaintext + vmovdqu %%T2, %%T5 + %endif + %endif + + %ifidn %%ENC_DEC, ENC + vaesenclast reg(j), reg(j), %%T2 + %else + %if (i<7) + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + ;; Do not read the data since it could fault + VXSTR 
[%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 + %else + vaesenclast reg(j), reg(j), %%T2 + %endif + %endif +%endif + +%assign i (i+1) +%assign j (j+1) +%endrep + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs + vpxor %%T7, %%T7, %%T3 + vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7 + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T7, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs + + vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + %ifidn %%ENC_DEC, ENC + ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 + %ifidn %%FULL_PARTIAL, full + ;; Avoid writing past the buffer if handling a partial block + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 + %endif + %endif + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%T7, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq %%T4, %%T3, %%T7, 0x10 + vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete + 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor           %%T1, %%T1, %%T4                ; the result is in %%T1

        vpshufb         %%XMM1, [SHUF_MASK]             ; perform a 16Byte swap
        vpshufb         %%XMM2, [SHUF_MASK]             ; perform a 16Byte swap
        vpshufb         %%XMM3, [SHUF_MASK]             ; perform a 16Byte swap
        vpshufb         %%XMM4, [SHUF_MASK]             ; perform a 16Byte swap
        vpshufb         %%XMM5, [SHUF_MASK]             ; perform a 16Byte swap
        vpshufb         %%XMM6, [SHUF_MASK]             ; perform a 16Byte swap
        vpshufb         %%XMM7, [SHUF_MASK]             ; perform a 16Byte swap
        vpshufb         %%XMM8, [SHUF_MASK]             ; perform a 16Byte swap


        vpxor           %%XMM1, %%T1


%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GHASH_LAST_8: GHASH the last 8 ciphertext blocks.
; IN:
;       GDATA           - base of the key data; HashKey_8 .. HashKey_2 and HashKey
;                         are loaded from it
;       XMM1 .. XMM8    - the eight blocks to hash; XMM1 is multiplied by HashKey_8,
;                         XMM8 by HashKey
; OUT:
;       T6              - the reduced 128-bit result
; CLOBBERS:
;       T2, T3, T4, T5, T7, XMM1
; NOTE:
;       T1 is accepted for signature compatibility but is not referenced.
; Method (Karatsuba): per block, T6 accumulates the high*high products (0x11),
; T7 the low*low products (0x00) and XMM1 the (hi^lo)*(hi^lo) products. The
; middle term is recovered as XMM1 ^ T6 ^ T7, folded into <T6:T7>, and the
; 256-bit value is reduced in two phases using the POLY2 constant.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GHASH_LAST_8 16
%define %%GDATA %1
%define %%T1    %2
%define %%T2    %3
%define %%T3    %4
%define %%T4    %5
%define %%T5    %6
%define %%T6    %7
%define %%T7    %8
%define %%XMM1  %9
%define %%XMM2  %10
%define %%XMM3  %11
%define %%XMM4  %12
%define %%XMM5  %13
%define %%XMM6  %14
%define %%XMM7  %15
%define %%XMM8  %16

        ;; Karatsuba Method

        ;; Block 1 * HashKey_8: initialises the three accumulators
        vmovdqu         %%T5, [%%GDATA + HashKey_8]

        vpshufd         %%T2, %%XMM1, 01001110b         ; swap the 64-bit halves
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM1              ; low qword = hi ^ lo of the block
        vpxor           %%T3, %%T3, %%T5                ; low qword = hi ^ lo of the key

        vpclmulqdq      %%T6, %%XMM1, %%T5, 0x11        ; high * high
        vpclmulqdq      %%T7, %%XMM1, %%T5, 0x00        ; low * low

        vpclmulqdq      %%XMM1, %%T2, %%T3, 0x00        ; (hi^lo) * (hi^lo)

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_7]
        vpshufd         %%T2, %%XMM2, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM2
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_6]
        vpshufd         %%T2, %%XMM3, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM3
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_5]
        vpshufd         %%T2, %%XMM4, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM4
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_4]
        vpshufd         %%T2, %%XMM5, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM5
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_3]
        vpshufd         %%T2, %%XMM6, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM6
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_2]
        vpshufd         %%T2, %%XMM7, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM7
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey]
        vpshufd         %%T2, %%XMM8, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM8
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM8, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM8, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2
        ;; Recover the Karatsuba middle term: XMM1 ^ T6 ^ T7
        vpxor           %%XMM1, %%XMM1, %%T6
        vpxor           %%T2, %%XMM1, %%T7




        ;; Split the middle term across the high (T6) and low (T7) halves
        vpslldq         %%T4, %%T2, 8
        vpsrldq         %%T2, %%T2, 8

        vpxor           %%T7, %%T7, %%T4
        vpxor           %%T6, %%T6, %%T2                ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        vmovdqu         %%T3, [POLY2]

        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
        vpslldq         %%T2, %%T2, 8                   ; shift-L %%T2 2 DWs

        vpxor           %%T7, %%T7, %%T2                ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        ;second phase of the reduction
        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
        vpsrldq         %%T2, %%T2, 4                   ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
        vpslldq         %%T4, %%T4, 4                   ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           %%T4, %%T4, %%T2                ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor           %%T6, %%T6, %%T4                ; the result is in %%T6
%endmacro


; GHASH the last 7 ciphertext blocks.
%macro  GHASH_LAST_7 15
;; GHASH seven blocks (Karatsuba, same scheme as the 8-block variant).
;; IN:       GDATA        - base of the key data (HashKey_7 .. HashKey_1 are loaded from it)
;;           XMM1 .. XMM7 - blocks to hash; XMM1 is multiplied by HashKey_7,
;;                          XMM7 by HashKey_1
;; OUT:      T6           - the reduced 128-bit result
;; CLOBBERS: T2, T3, T4, T5, T7, XMM1
;; NOTE:     T1 is accepted but not referenced.
%define %%GDATA %1
%define %%T1    %2
%define %%T2    %3
%define %%T3    %4
%define %%T4    %5
%define %%T5    %6
%define %%T6    %7
%define %%T7    %8
%define %%XMM1  %9
%define %%XMM2  %10
%define %%XMM3  %11
%define %%XMM4  %12
%define %%XMM5  %13
%define %%XMM6  %14
%define %%XMM7  %15

        ;; Karatsuba Method

        ;; Block 1 * HashKey_7: initialises the three accumulators
        vmovdqu         %%T5, [%%GDATA + HashKey_7]

        vpshufd         %%T2, %%XMM1, 01001110b         ; swap the 64-bit halves
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM1
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T6, %%XMM1, %%T5, 0x11        ; high * high
        vpclmulqdq      %%T7, %%XMM1, %%T5, 0x00        ; low * low

        vpclmulqdq      %%XMM1, %%T2, %%T3, 0x00        ; (hi^lo) * (hi^lo)

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_6]
        vpshufd         %%T2, %%XMM2, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM2
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_5]
        vpshufd         %%T2, %%XMM3, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM3
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_4]
        vpshufd         %%T2, %%XMM4, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM4
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_3]
        vpshufd         %%T2, %%XMM5, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM5
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_2]
        vpshufd         %%T2, %%XMM6, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM6
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_1]
        vpshufd         %%T2, %%XMM7, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM7
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Recover the Karatsuba middle term: XMM1 ^ T6 ^ T7
        vpxor           %%XMM1, %%XMM1, %%T6
        vpxor           %%T2, %%XMM1, %%T7




        ;; Split the middle term across the high (T6) and low (T7) halves
        vpslldq         %%T4, %%T2, 8
        vpsrldq         %%T2, %%T2, 8

        vpxor           %%T7, %%T7, %%T4
        vpxor           %%T6, %%T6, %%T2                ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        vmovdqu         %%T3, [POLY2]

        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
        vpslldq         %%T2, %%T2, 8                   ; shift-L %%T2 2 DWs

        vpxor           %%T7, %%T7, %%T2                ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        ;second phase of the reduction
        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
        vpsrldq         %%T2, %%T2, 4                   ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
        vpslldq         %%T4, %%T4, 4                   ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           %%T4, %%T4, %%T2                ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor           %%T6, %%T6, %%T4                ; the result is in %%T6
%endmacro



;;; Handle encryption of the final partial block
;;; IN:
;;;   r13  - Number of bytes to read (also the number of bytes written)
;;; MODIFIES:
;;;   KEY  - In: encrypted counter block for the partial block.
;;;          Out (ENC): the masked ciphertext block.
;;;          Out (DEC): the masked input ciphertext block (restored from T2).
;;;   DATA_OFFSET - advanced by the number of bytes written
;;;   r13  - counted down to zero by the byte output loop
;;; SMASHES:
;;;   r12, rax
;;;   r10, r15 (only in the 'LT16' path, as temporaries of READ_SMALL_DATA_INPUT)
;;;   T1, T2
;;; Note:
;;;   PLAIN_CYPH_LEN, %6, is passed only to determine
;;;   if buffer is big enough to do a 16 byte read & shift.
;;;     'LT16' is passed here only if buffer is known to be smaller
;;;     than 16 bytes.
;;;     Any other value passed here will result in 16 byte read
;;;     code path.
;;;   The 16 byte read path loads the 16 bytes that END at the last input
;;;   byte, i.e. it reads (16 - r13) bytes located before DATA_OFFSET.
%macro  ENCRYPT_FINAL_PARTIAL_BLOCK 8
%define %%KEY             %1
%define %%T1              %2
%define %%T2              %3
%define %%CYPH_PLAIN_OUT  %4
%define %%PLAIN_CYPH_IN   %5
%define %%PLAIN_CYPH_LEN  %6
%define %%ENC_DEC         %7
%define %%DATA_OFFSET     %8

        ;; NOTE: type of read tuned based %%PLAIN_CYPH_LEN setting
%ifidn %%PLAIN_CYPH_LEN, LT16
        ;; Handle the case where the message is < 16 bytes
        lea             r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]

        ;; T1            - packed output
        ;; r10           - input data address
        ;; r13           - input data length
        ;; r12, r15, rax - temp registers
        READ_SMALL_DATA_INPUT   %%T1, r10, r13, r12, r15, rax

        lea             r12, [SHIFT_MASK + 16]
        sub             r12, r13
%else
        ;; Handle the case where the message is >= 16 bytes
        sub             %%DATA_OFFSET, 16
        add             %%DATA_OFFSET, r13
        ;; Receive the last <16 Byte block
        vmovdqu         %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]
        sub             %%DATA_OFFSET, r13
        add             %%DATA_OFFSET, 16

        lea             r12, [SHIFT_MASK + 16]
        ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        ;; (r13 is the number of bytes in plaintext mod 16)
        sub             r12, r13
        ;; Get the appropriate shuffle mask
        vmovdqu         %%T2, [r12]
        ;; shift right 16-r13 bytes
        vpshufb         %%T1, %%T2
%endif ; %%PLAIN_CYPH_LEN, LT16

        ;; At this point T1 contains the partial block data
%ifidn  %%ENC_DEC, DEC
        ;; Plaintext XOR E(K, Yn)
        ;; Set aside the ciphertext
        vmovdqa         %%T2, %%T1
        vpxor           %%KEY, %%KEY, %%T1
        ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
        vmovdqu         %%T1, [r12 + ALL_F - SHIFT_MASK]
        ;; Mask out top 16-r13 bytes of the decrypted block
        vpand           %%KEY, %%KEY, %%T1

        ;; Prepare the ciphertext for the hash
        ;; mask out top 16-r13 bytes of the saved ciphertext
        vpand           %%T2, %%T2, %%T1
%else
        ;; Plaintext XOR E(K, Yn)
        vpxor           %%KEY, %%KEY, %%T1
        ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
        vmovdqu         %%T1, [r12 + ALL_F - SHIFT_MASK]
        ;; Mask out top 16-r13 bytes of %%KEY
        vpand           %%KEY, %%KEY, %%T1
%endif

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Output r13 Bytes
        vmovq           rax, %%KEY
        cmp             r13, 8
        jle             %%_less_than_8_bytes_left

        ;; More than 8 bytes: store the low qword, continue with the high qword
        mov             [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
        add             %%DATA_OFFSET, 8
        vpsrldq         %%T1, %%KEY, 8
        vmovq           rax, %%T1
        sub             r13, 8

%%_less_than_8_bytes_left:
        ;; Emit the remaining 1..8 bytes one at a time from rax
        mov             BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
        add             %%DATA_OFFSET, 1
        shr             rax, 8
        sub             r13, 1
        jne             %%_less_than_8_bytes_left
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifidn  %%ENC_DEC, DEC
        ;; If decrypt, restore the ciphertext into %%KEY
        vmovdqu         %%KEY, %%T2
%endif
%endmacro                       ; ENCRYPT_FINAL_PARTIAL_BLOCK



; Encryption of a single block
; IN:     GDATA - base of the expanded AES round keys (16 bytes per round key)
; IN/OUT: XMM0  - block to encrypt, replaced by the result
; Applies the initial whitening XOR, NROUNDS vaesenc rounds and a final vaesenclast.
%macro  ENCRYPT_SINGLE_BLOCK 2
%define %%GDATA %1
%define %%XMM0  %2

        vpxor           %%XMM0, %%XMM0, [%%GDATA+16*0]
%assign i 1
%rep NROUNDS
        vaesenc         %%XMM0, [%%GDATA+16*i]
%assign i (i+1)
%endrep
        vaesenclast     %%XMM0, [%%GDATA+16*i]
%endmacro


;; Start of Stack Setup

;; FUNC_SAVE: push the registers this code uses, remember the entry rsp in r14
;; and carve out a 64-byte aligned scratch frame of VARIABLE_OFFSET bytes.
;; On win64 it also spills xmm6-xmm15 and fetches the fifth argument.
%macro FUNC_SAVE 0
        ;; Required for Update/GCM_ENC
        ;the number of pushes must equal STACK_OFFSET
        push            r12
        push            r13
        push            r14
        push            r15
        push            rsi
        mov             r14, rsp                        ; r14 = rsp to restore in FUNC_RESTORE

        sub             rsp, VARIABLE_OFFSET
        and             rsp, ~63                        ; align the frame down to 64 bytes

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        vmovdqu         [rsp + LOCAL_STORAGE + 0*16],xmm6
        vmovdqu         [rsp + LOCAL_STORAGE + 1*16],xmm7
        vmovdqu         [rsp + LOCAL_STORAGE + 2*16],xmm8
        vmovdqu         [rsp + LOCAL_STORAGE + 3*16],xmm9
        vmovdqu         [rsp + LOCAL_STORAGE + 4*16],xmm10
        vmovdqu         [rsp + LOCAL_STORAGE + 5*16],xmm11
        vmovdqu         [rsp + LOCAL_STORAGE + 6*16],xmm12
        vmovdqu         [rsp + LOCAL_STORAGE + 7*16],xmm13
        vmovdqu         [rsp + LOCAL_STORAGE + 8*16],xmm14
        vmovdqu         [rsp + LOCAL_STORAGE + 9*16],xmm15

        mov             arg5, arg(5)                    ;[r14 + STACK_OFFSET + 8*5]
%endif
%endmacro


;; FUNC_RESTORE: undo FUNC_SAVE - reload xmm6-xmm15 on win64, restore rsp from
;; r14 and pop the saved registers in reverse push order.
%macro FUNC_RESTORE 0

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu         xmm15, [rsp + LOCAL_STORAGE + 9*16]
        vmovdqu         xmm14, [rsp + LOCAL_STORAGE + 8*16]
        vmovdqu         xmm13, [rsp + LOCAL_STORAGE + 7*16]
        vmovdqu         xmm12, [rsp + LOCAL_STORAGE + 6*16]
        vmovdqu         xmm11, [rsp + LOCAL_STORAGE + 5*16]
        vmovdqu         xmm10, [rsp + LOCAL_STORAGE + 4*16]
        vmovdqu         xmm9, [rsp + LOCAL_STORAGE + 3*16]
        vmovdqu         xmm8, [rsp + LOCAL_STORAGE + 2*16]
        vmovdqu         xmm7, [rsp + LOCAL_STORAGE + 1*16]
        vmovdqu         xmm6, [rsp + LOCAL_STORAGE + 0*16]
%endif

;; Required for Update/GCM_ENC
        mov             rsp, r14
        pop             rsi
        pop             r15
        pop             r14
        pop             r13
        pop             r12
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
; Additional Authentication data (A_IN), Additional Data length (A_LEN).
; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
; Clobbers rax, r10-r13, and xmm0-xmm6; xmm14 is written with the AAD hash.
; On exit xmm2 holds the byte-swapped initial counter block that was stored to CurCount.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_INIT        5
%define %%GDATA_KEY     %1
%define %%GDATA_CTX     %2
%define %%IV            %3
%define %%A_IN          %4
%define %%A_LEN         %5
%define %%AAD_HASH      xmm14
%define %%SUBHASH       xmm1


        vmovdqu         %%SUBHASH, [%%GDATA_KEY + HashKey]

        ;; Skip the AAD hash entirely when there is no AAD
        mov             r10, %%A_LEN
        cmp             r10, 0
        je              %%_aad_is_zero

        CALC_AAD_HASH   %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
        jmp             %%_after_aad

%%_aad_is_zero:
        vpxor           %%AAD_HASH, %%AAD_HASH

%%_after_aad:
        mov             r10, %%A_LEN
        ;; Zero xmm2 so that PBlockEncKey below really is initialised to 0.
        ;; (XORing xmm2 with xmm3 would store whatever the two scratch
        ;; registers happened to contain.)
        vpxor           xmm2, xmm2, xmm2

        vmovdqu         [%%GDATA_CTX + AadHash], %%AAD_HASH     ; ctx_data.aad hash = aad_hash
        mov             [%%GDATA_CTX + AadLen], r10             ; ctx_data.aad_length = aad_length
        xor             r10, r10
        mov             [%%GDATA_CTX + InLen], r10              ; ctx_data.in_length = 0
        mov             [%%GDATA_CTX + PBlockLen], r10          ; ctx_data.partial_block_length = 0
        vmovdqu         [%%GDATA_CTX + PBlockEncKey], xmm2      ; ctx_data.partial_block_enc_key = 0
        mov             r10, %%IV
        vmovdqa         xmm2, [rel ONEf]                        ; read 12 IV bytes and pad with 0x00000001
        vpinsrq         xmm2, [r10], 0
        vpinsrd         xmm2, [r10+8], 2
        vmovdqu         [%%GDATA_CTX + OrigIV], xmm2            ; ctx_data.orig_IV = iv

        vpshufb         xmm2, [SHUF_MASK]

        vmovdqu         [%%GDATA_CTX + CurCount], xmm2          ; ctx_data.current_counter = iv
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC_SMALL dispatches on %%NUM_BLOCKS (1..8) to the matching
; INITIAL_BLOCKS_PARTIAL expansion.
; NOTE: %%PLAIN_CYPH_LEN, %%LENGTH, %%CTR and %%HASH are accepted but not
; referenced; every expansion below hard-codes r13 as the length and the xmm
; register assignment listed under %%_small_initial_num_blocks_is_7.
; %%NUM_BLOCKS must not be 0; any value not matched by 2..8 takes the 1-block path.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_ENC_DEC_SMALL   12
%define %%GDATA_KEY         %1
%define %%GDATA_CTX         %2
%define %%CYPH_PLAIN_OUT    %3
%define %%PLAIN_CYPH_IN     %4
%define %%PLAIN_CYPH_LEN    %5
%define %%ENC_DEC           %6
%define %%DATA_OFFSET       %7
%define %%LENGTH            %8
%define %%NUM_BLOCKS        %9
%define %%CTR               %10
%define %%HASH              %11
%define %%INSTANCE_TYPE     %12

        ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
        ;; cmp  %%NUM_BLOCKS, 0
        ;; je   %%_small_initial_blocks_encrypted
        cmp             %%NUM_BLOCKS, 8
        je              %%_small_initial_num_blocks_is_8
        cmp             %%NUM_BLOCKS, 7
        je              %%_small_initial_num_blocks_is_7
        cmp             %%NUM_BLOCKS, 6
        je              %%_small_initial_num_blocks_is_6
        cmp             %%NUM_BLOCKS, 5
        je              %%_small_initial_num_blocks_is_5
        cmp             %%NUM_BLOCKS, 4
        je              %%_small_initial_num_blocks_is_4
        cmp             %%NUM_BLOCKS, 3
        je              %%_small_initial_num_blocks_is_3
        cmp             %%NUM_BLOCKS, 2
        je              %%_small_initial_num_blocks_is_2

        jmp             %%_small_initial_num_blocks_is_1


%%_small_initial_num_blocks_is_8:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp             %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_7:
        ;; r13   - %%LENGTH
        ;; xmm12 - T1
        ;; xmm13 - T2
        ;; xmm14 - T3   - AAD HASH OUT when not producing 8 AES keys
        ;; xmm15 - T4
        ;; xmm11 - T5
        ;; xmm9  - CTR
        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8 - AAD HASH IN
        ;; xmm10 - T6
        ;; xmm0  - T_key
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp             %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_6:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp             %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_5:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp             %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_4:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp             %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_3:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp             %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_2:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp             %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_1:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE

        ;; Note: zero initial blocks not allowed.

%%_small_initial_blocks_encrypted:

%endmacro                       ; GCM_ENC_DEC_SMALL

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
; has been initialized by GCM_INIT
; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
; INSTANCE_TYPE is 'single_call' or 'multi_call'; in the single_call case the
; counter block is taken from xmm2 instead of being reloaded from GDATA_CTX.
; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10-r15, and xmm0-xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_ENC_DEC         7
%define %%GDATA_KEY         %1
%define %%GDATA_CTX         %2
%define %%CYPH_PLAIN_OUT    %3
%define %%PLAIN_CYPH_IN     %4
%define %%PLAIN_CYPH_LEN    %5
%define %%ENC_DEC           %6
%define %%INSTANCE_TYPE     %7
%define %%DATA_OFFSET       r11

; Macro flow:
; calculate the number of 16byte blocks in the message
; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'

        cmp             %%PLAIN_CYPH_LEN, 0
        je              %%_enc_dec_done

        xor             %%DATA_OFFSET, %%DATA_OFFSET
        ;; Update length of data processed
        add             [%%GDATA_CTX+InLen], %%PLAIN_CYPH_LEN
        vmovdqu         xmm13, [%%GDATA_KEY + HashKey]
        vmovdqu         xmm8, [%%GDATA_CTX + AadHash]

%ifidn %%INSTANCE_TYPE, multi_call
        ;; NOTE: partial block processing makes only sense for multi_call here.
        ;; Used for the update flow - if there was a previous partial
        ;; block fill the remaining bytes here.
        PARTIAL_BLOCK   %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
%endif

        ;;  lift CTR set from initial_blocks to here
%ifidn %%INSTANCE_TYPE, single_call
        vmovdqu         xmm9, xmm2
%else
        vmovdqu         xmm9, [%%GDATA_CTX + CurCount]
%endif

        ;; Save the amount of data left to process in r13
        mov             r13, %%PLAIN_CYPH_LEN
%ifidn %%INSTANCE_TYPE, multi_call
        ;; NOTE: %%DATA_OFFSET is zero in single_call case.
        ;;      Consequently PLAIN_CYPH_LEN will never be zero after
        ;;      %%DATA_OFFSET subtraction below.
        sub             r13, %%DATA_OFFSET

        ;; There may be no more data if it was consumed in the partial block.
        cmp             r13, 0
        je              %%_enc_dec_done
%endif                          ; %%INSTANCE_TYPE, multi_call
        mov             r10, r13

        ;; Determine how many blocks to process in INITIAL
        mov             r12, r13
        shr             r12, 4
        and             r12, 7

        ;; Process one additional block in INITIAL if there is a partial block
        and             r10, 0xf
        blsmsk          r10, r10        ; Set CF if zero
        cmc                             ; Flip CF
        adc             r12, 0x0        ; Process an additional INITIAL block if CF set

        ;; Fewer than 128 bytes are handled by the small message code, which
        ;; can process up to 8 16B blocks (7 full blocks plus a partial one).
        cmp             r13, 128
        jge             %%_large_message_path

        GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
        jmp             %%_ghash_done

%%_large_message_path:
        and             r12, 0x7        ; Still, don't allow 8 INITIAL blocks since that case
                                        ; can be handled by the x8 partial loop.

        cmp             r12, 0
        je              %%_initial_num_blocks_is_0
        cmp             r12, 7
        je              %%_initial_num_blocks_is_7
        cmp             r12, 6
        je              %%_initial_num_blocks_is_6
        cmp             r12, 5
        je              %%_initial_num_blocks_is_5
        cmp             r12, 4
        je              %%_initial_num_blocks_is_4
        cmp             r12, 3
        je              %%_initial_num_blocks_is_3
        cmp             r12, 2
        je              %%_initial_num_blocks_is_2

        jmp             %%_initial_num_blocks_is_1

%%_initial_num_blocks_is_7:
        ;; r13   - %%LENGTH
        ;; xmm12 - T1
        ;; xmm13 - T2
        ;; xmm14 - T3   - AAD HASH OUT when not producing 8 AES keys
        ;; xmm15 - T4
        ;; xmm11 - T5
        ;; xmm9  - CTR
        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8 - AAD HASH IN
        ;; xmm10 - T6
        ;; xmm0  - T_key
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp             %%_initial_blocks_encrypted

%%_initial_num_blocks_is_6:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp             %%_initial_blocks_encrypted

%%_initial_num_blocks_is_5:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp             %%_initial_blocks_encrypted

%%_initial_num_blocks_is_4:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp             %%_initial_blocks_encrypted

%%_initial_num_blocks_is_3:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp             %%_initial_blocks_encrypted

%%_initial_num_blocks_is_2:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp             %%_initial_blocks_encrypted

%%_initial_num_blocks_is_1:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp             %%_initial_blocks_encrypted

%%_initial_num_blocks_is_0:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC


%%_initial_blocks_encrypted:
        ;; The entire message was encrypted in INITIAL and now only needs to be hashed
        cmp             r13, 0
        je              %%_encrypt_done

        ;; Encrypt the final <16 byte (partial) block, then hash
        cmp             r13, 16
        jl              %%_encrypt_final_partial

        ;; Process 7 full blocks plus a partial block
        cmp             r13, 128
        jl              %%_encrypt_by_8_partial


%%_encrypt_by_8_parallel:
        ;; in_order vs. out_order is an optimization to increment the counter without shuffling
        ;; it back into little endian. r15d keeps track of when we need to increment in order so
        ;; that the carry is handled correctly.
        vmovd           r15d, xmm9
        and             r15d, 255
        vpshufb         xmm9, [rel SHUF_MASK]


%%_encrypt_by_8_new:
        ;; Use the in_order variant when adding 8 could carry out of the low counter byte
        cmp             r15d, 255-8
        jg              %%_encrypt_by_8



        ;; xmm0  - T1
        ;; xmm10 - T2
        ;; xmm11 - T3
        ;; xmm12 - T4
        ;; xmm13 - T5
        ;; xmm14 - T6
        ;; xmm9  - CTR
        ;; xmm1  - XMM1
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8
        ;; xmm15 - T7
        add             r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL      %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
        add             %%DATA_OFFSET, 128
        sub             r13, 128
        cmp             r13, 128
        jge             %%_encrypt_by_8_new

        vpshufb         xmm9, [SHUF_MASK]
        jmp             %%_encrypt_by_8_parallel_done

%%_encrypt_by_8:
        vpshufb         xmm9, [SHUF_MASK]
        add             r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL      %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
        vpshufb         xmm9, [SHUF_MASK]
        add             %%DATA_OFFSET, 128
        sub             r13, 128
        cmp             r13, 128
        jge             %%_encrypt_by_8_new
        vpshufb         xmm9, [SHUF_MASK]


%%_encrypt_by_8_parallel_done:
        ;; Test to see if we need a by 8 with partial block. At this point
        ;; bytes remaining should be either zero or between 113-127.
        cmp             r13, 0
        je              %%_encrypt_done

%%_encrypt_by_8_partial:
        ;; Shuffle needed to align key for partial block xor. out_order
        ;; is a little faster because it avoids extra shuffles.
        ;; TBD: Might need to account for when we don't have room to increment the counter.


        ;; Process parallel buffers with a final partial block.
        GHASH_8_ENCRYPT_8_PARALLEL      %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial


        ;; Only the 7 full blocks were consumed; the partial block is handled below
        add             %%DATA_OFFSET, 128-16
        sub             r13, 128-16

%%_encrypt_final_partial:

        vpshufb         xmm8, [SHUF_MASK]
        mov             [%%GDATA_CTX + PBlockLen], r13
        vmovdqu         [%%GDATA_CTX + PBlockEncKey], xmm8

        ;; xmm8  - Final encrypted counter - need to hash with partial or full block ciphertext
        ;;                            GDATA,  KEY,   T1,   T2
        ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET

        vpshufb         xmm8, [SHUF_MASK]


%%_encrypt_done:

        ;; Mapping to macro parameters
        ;; IN:
        ;;   xmm9 contains the counter
        ;;   xmm1-xmm8 contain the xor'd ciphertext
        ;; OUT:
        ;;   xmm14 contains the final hash
        ;;             GDATA,   T1,    T2,    T3,    T4,    T5,    T6,    T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
%ifidn %%INSTANCE_TYPE, multi_call
        ;; With a pending partial block only the first 7 blocks are hashed here
        mov             r13, [%%GDATA_CTX + PBlockLen]
        cmp             r13, 0
        jz              %%_hash_last_8
        GHASH_LAST_7    %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
        ;; XOR the partial word into the hash
        vpxor           xmm14, xmm14, xmm8
        jmp             %%_ghash_done
%endif
%%_hash_last_8:
        GHASH_LAST_8    %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8

%%_ghash_done:
        vmovdqu         [%%GDATA_CTX + CurCount], xmm9  ; my_ctx_data.current_counter = xmm9
        vmovdqu         [%%GDATA_CTX + AadHash], xmm14  ; my_ctx_data.aad hash = xmm14

%%_enc_dec_done:


%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15
+;         (xmm13 is loaded with the hash key; xmm0/xmm10/xmm11/xmm5/xmm6 are handed to
+;          GHASH_MUL as operands 3-7 - presumably scratch, confirm against GHASH_MUL)
+; Tag length handling: 16 and 12 are honoured; any other AUTH_TAG_LEN writes 8 bytes.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%INSTANCE_TYPE %6
+%define %%PLAIN_CYPH_LEN rax
+
+        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+        ;; Start AES as early as possible
+        vmovdqu xmm9, [%%GDATA_CTX + OrigIV]            ; xmm9 = Y0
+        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9          ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+        ;; If the GCM function is called as a single function call rather
+        ;; than invoking the individual parts (init, update, finalize) we
+        ;; can remove a write to read dependency on AadHash.
+        vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+        ;; Encrypt the final partial block. If we did this as a single call then
+        ;; the partial block was handled in the main GCM_ENC_DEC macro.
+        mov     r12, [%%GDATA_CTX + PBlockLen]
+        test    r12, r12                                ; no pending partial block?
+        jz      %%_partial_done
+
+        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+        vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+        mov     r12, [%%GDATA_CTX + AadLen]             ; r12 = aadLen (number of bytes)
+        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+        shl     r12, 3                                  ; convert into number of bits
+        ;; AadLen is read as a 64-bit quantity, so move all 64 bits: a 32-bit
+        ;; vmovd would drop the upper half of len(A) for AAD >= 2^29 bytes.
+        vmovq   xmm15, r12                              ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3                     ; len(C) in bits
+        vmovq   xmm1, %%PLAIN_CYPH_LEN
+        vpslldq xmm15, xmm15, 8                         ; xmm15 = len(A)|| 0x0000000000000000
+        vpxor   xmm15, xmm15, xmm1                      ; xmm15 = len(A)||len(C)
+
+        vpxor   xmm14, xmm14, xmm15
+        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+        vpshufb xmm14, [rel SHUF_MASK]                  ; perform a 16Byte swap
+
+        vpxor   xmm9, xmm9, xmm14                       ; tag = E(K, Y0) xor GHASH
+
+
+%%_return_T:
+        mov     r10, %%AUTH_TAG                         ; r10 = authTag
+        mov     r11, %%AUTH_TAG_LEN                     ; r11 = auth_tag_len
+
+        cmp     r11, 16
+        je      %%_T_16
+
+        cmp     r11, 12
+        je      %%_T_12
+
+%%_T_8:                                                 ; default: 8-byte tag
+        vmovq   rax, xmm9
+        mov     [r10], rax
+        jmp     %%_return_T_done
+%%_T_12:                                                ; 8 bytes + 4 bytes
+        vmovq   rax, xmm9
+        mov     [r10], rax
+        vpsrldq xmm9, xmm9, 8
+        vmovd   eax, xmm9
+        mov     [r10 + 8], eax
+        jmp     %%_return_T_done
+
+%%_T_16:
+        vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx_gen4 /
+; aes_gcm_precomp_192_avx_gen4 /
+; aes_gcm_precomp_256_avx_gen4
+; (struct gcm_key_data *key_data)
+;
+; Encrypts an all-zero block to obtain the hash key, stores HashKey<<1 mod poly
+; at [key_data + HashKey] and then runs PRECOMPUTE on it.
+; r14 holds the caller's rsp while the stack is realigned to 64 bytes.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+        endbranch
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+
+        mov     r14, rsp                                ; remember rsp before realignment
+
+
+
+        sub     rsp, VARIABLE_OFFSET
+        and     rsp, ~63                                ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; only xmm6 needs to be maintained
+        vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+        vpxor   xmm6, xmm6, xmm6                        ; all-zero input block
+        ENCRYPT_SINGLE_BLOCK arg1, xmm6                 ; xmm6 = HashKey
+
+        vpshufb xmm6, [rel SHUF_MASK]
+        ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+        vmovdqa xmm2, xmm6
+        vpsllq  xmm6, xmm6, 1
+        vpsrlq  xmm2, xmm2, 63                          ; bit shifted out of each qword
+        vmovdqa xmm1, xmm2
+        vpslldq xmm2, xmm2, 8                           ; carry low qword -> high qword
+        vpsrldq xmm1, xmm1, 8                           ; bit shifted out of the high qword
+        vpor    xmm6, xmm6, xmm2
+        ;reduction
+        vpshufd xmm2, xmm1, 00100100b
+        vpcmpeqd xmm2, xmm2, [rel TWOONE]
+        vpand   xmm2, xmm2, [rel POLY]
+        vpxor   xmm6, xmm6, xmm2                        ; xmm6 holds the HashKey<<1 mod poly
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        vmovdqu [arg1 + HashKey], xmm6                  ; store HashKey<<1 mod poly
+
+
+        PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+        mov     rsp, r14
+
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;
+; Thin wrapper around GCM_INIT. On win64 the fifth argument lives on the stack,
+; so rsi (arg5) is saved and then loaded with it before the macro runs.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+        endbranch
+        push    r12
+        push    r13
+%ifidn __OUTPUT_FORMAT__, win64
+        ; xmm6:xmm15 need to be maintained for Windows
+        ; NOTE(review): only xmm6 is saved here - confirm GCM_INIT touches no other xmm6-xmm15.
+        push    arg5
+        sub     rsp, 1*16
+        vmovdqu [rsp + 0*16],xmm6
+        ; 1*16 = xmm6 slot, 8*3 = the three pushes above, 8*5 = offset of the
+        ; fifth argument from rsp at entry
+        mov     arg5, [rsp + 1*16 + 8*3 + 8*5]
+%endif
+
+        GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm6, [rsp + 0*16]
+        add     rsp, 1*16
+        pop     arg5
+%endif
+        pop     r13
+        pop     r12
+        ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 /
+; aes_gcm_enc_256_update_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 
plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Streaming encrypt step: GCM_ENC_DEC in multi_call mode between FUNC_SAVE / FUNC_RESTORE.
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+        endbranch
+
+        FUNC_SAVE
+
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+        FUNC_RESTORE
+
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 /
+; aes_gcm_dec_256_update_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Streaming decrypt step: GCM_ENC_DEC in multi_call mode between FUNC_SAVE / FUNC_RESTORE.
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+        endbranch
+
+        FUNC_SAVE
+
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+        FUNC_RESTORE
+
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 /
+; aes_gcm_enc_256_finalize_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; Runs GCM_COMPLETE (multi_call) to produce the tag. r12 is preserved because
+; the macro uses it. On win64 every callee-saved xmm register the macro is
+; given is preserved: xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+        endbranch
+
+        push    r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; xmm6:xmm15 need to be maintained for Windows
+        ; GCM_COMPLETE loads the hash key into xmm13 and passes xmm10 to
+        ; GHASH_MUL, so both are saved along with xmm6/9/11/14/15.
+        sub     rsp, 7*16
+        vmovdqu [rsp + 0*16], xmm6
+        vmovdqu [rsp + 1*16], xmm9
+        vmovdqu [rsp + 2*16], xmm10
+        vmovdqu [rsp + 3*16], xmm11
+        vmovdqu [rsp + 4*16], xmm13
+        vmovdqu [rsp + 5*16], xmm14
+        vmovdqu [rsp + 6*16], xmm15
+%endif
+        GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm15, [rsp + 6*16]
+        vmovdqu xmm14, [rsp + 5*16]
+        vmovdqu xmm13, [rsp + 4*16]
+        vmovdqu xmm11, [rsp + 3*16]
+        vmovdqu xmm10, [rsp + 2*16]
+        vmovdqu xmm9, [rsp + 1*16]
+        vmovdqu xmm6, [rsp + 0*16]
+        add     rsp, 7*16
+%endif
+
+        pop     r12
+        ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 /
+; aes_gcm_dec_256_finalize_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; Same as the enc finalize entry point, but GCM_COMPLETE is invoked with DEC.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+        endbranch
+
+        push    r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; xmm6:xmm15 need to be maintained for Windows
+        ; GCM_COMPLETE loads the hash key into xmm13 and passes xmm10 to
+        ; GHASH_MUL, so both are saved along with xmm6/9/11/14/15.
+        sub     rsp, 7*16
+        vmovdqu [rsp + 0*16], xmm6
+        vmovdqu [rsp + 1*16], xmm9
+        vmovdqu [rsp + 2*16], xmm10
+        vmovdqu [rsp + 3*16], xmm11
+        vmovdqu [rsp + 4*16], xmm13
+        vmovdqu [rsp + 5*16], xmm14
+        vmovdqu [rsp + 6*16], xmm15
+%endif
+        GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm15, [rsp + 6*16]
+        vmovdqu xmm14, [rsp + 5*16]
+        vmovdqu xmm13, [rsp + 4*16]
+        vmovdqu xmm11, [rsp + 3*16]
+        vmovdqu xmm10, [rsp + 2*16]
+        vmovdqu xmm9, [rsp + 1*16]
+        vmovdqu xmm6, [rsp + 0*16]
+        add     rsp, 7*16
+%endif
+
+        pop     r12
+        ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; One-shot encrypt: GCM_INIT, GCM_ENC_DEC and GCM_COMPLETE back to back in
+; single_call mode.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+        endbranch
+
+        FUNC_SAVE
+
+        GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+
+        GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
+
+        FUNC_RESTORE
+
+        ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void 
aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; One-shot decrypt: GCM_INIT, GCM_ENC_DEC and GCM_COMPLETE back to back in
+; single_call mode, bracketed by FUNC_SAVE / FUNC_RESTORE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+        endbranch
+
+        FUNC_SAVE                       ; defined outside this chunk; presumably saves registers and sets up r14 for arg7+ - confirm
+
+        GCM_INIT arg1, arg2, arg6, arg7, arg8           ; key_data, context_data, iv, aad, aad_len
+
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call      ; key_data, context_data, out, in, length
+
+        GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call          ; key_data, context_data, auth_tag, auth_tag_len
+
+        FUNC_RESTORE
+
+        ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
new file mode 100644
index 000000000..e823b7959
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
@@ -0,0 +1,291 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef GCM_DEFINES_ASM_INCLUDED
+%define GCM_DEFINES_ASM_INCLUDED
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+
+
+;;;;;; Shared constant tables and structure offsets for the GCM implementations
+
+section .data
+
+align 16
+
+POLY dq 0x0000000000000001, 0xC200000000000000         ; ANDed with the carry mask when reducing HashKey<<1 (see the precomp routines)
+
+align 64
+POLY2 dq 0x00000001C2000000, 0xC200000000000000        ; one 128-bit constant repeated in four lanes (64 bytes)
+ dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+align 16
+TWOONE dq 0x0000000000000001, 0x0000000100000000       ; vpcmpeqd pattern used to build the reduction mask in precomp
+
+; order of these constants should not change.
+; more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+align 64
+SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607    ; vpshufb control: byte i selects source byte 15-i (16-byte reversal), four lanes
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+SHIFT_MASK dq 0x0706050403020100, 0x0f0e0d0c0b0a0908   ; byte indices 0x00..0x0f in order
+ALL_F dq 0xffffffffffffffff, 0xffffffffffffffff        ; 128 one bits
+ZERO dq 0x0000000000000000, 0x0000000000000000         ; 128 zero bits
+ONE dq 0x0000000000000001, 0x0000000000000000          ; 1 in the least significant byte
+TWO dq 0x0000000000000002, 0x0000000000000000          ; 2 in the least significant byte
+ONEf dq 0x0000000000000000, 0x0100000000000000         ; 1 in the most significant byte (ONE with its 16 bytes reversed)
+TWOf dq 0x0000000000000000, 0x0200000000000000         ; 2 in the most significant byte (TWO with its 16 bytes reversed)
+
+align 64
+ddq_add_1234:                                          ; four 128-bit lanes: 1, 2, 3, 4 in the low qword
+ dq 0x0000000000000001, 0x0000000000000000
+ dq 0x0000000000000002, 0x0000000000000000
+ dq 0x0000000000000003, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_5678:                                          ; four 128-bit lanes: 5, 6, 7, 8 in the low qword
+ dq 0x0000000000000005, 0x0000000000000000
+ dq 0x0000000000000006, 0x0000000000000000
+ dq 0x0000000000000007, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_add_4444:                                          ; four 128-bit lanes, each 4 in the low qword
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_8888:                                          ; four 128-bit lanes, each 8 in the low qword
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_addbe_1234:                                        ; as ddq_add_1234, value placed in the most significant byte
+ dq 0x0000000000000000, 0x0100000000000000
+ dq 0x0000000000000000, 0x0200000000000000
+ dq 0x0000000000000000, 0x0300000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+
+align 64
+ddq_addbe_5678:                                        ; as ddq_add_5678, value placed in the most significant byte
+ dq 0x0000000000000000, 0x0500000000000000
+ dq 0x0000000000000000, 0x0600000000000000
+ dq 0x0000000000000000, 0x0700000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+
+align 64
+ddq_addbe_4444:                                        ; as ddq_add_4444, value placed in the most significant byte
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+
+align 64
+ddq_addbe_8888:                                        ; as ddq_add_8888, value placed in the most significant byte
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+
+align 64
+byte_len_to_mask_table:                                ; 17 words: entry n (0..16) has its low n bits set
+ dw 0x0000, 0x0001, 0x0003, 0x0007,
+ dw 0x000f, 0x001f, 0x003f, 0x007f,
+ dw 0x00ff, 0x01ff, 0x03ff, 0x07ff,
+ dw 0x0fff, 0x1fff, 0x3fff, 0x7fff,
+ dw 0xffff
+
+align 64
+byte64_len_to_mask_table:                              ; 65 qwords: entry n (0..64) has its low n bits set
+ dq 0x0000000000000000, 0x0000000000000001
+ dq 0x0000000000000003, 0x0000000000000007
+ dq 0x000000000000000f, 0x000000000000001f
+ dq 0x000000000000003f, 0x000000000000007f
+ dq 0x00000000000000ff, 0x00000000000001ff
+ dq 0x00000000000003ff, 0x00000000000007ff
+ dq 0x0000000000000fff, 0x0000000000001fff
+ dq 0x0000000000003fff, 0x0000000000007fff
+ dq 0x000000000000ffff, 0x000000000001ffff
+ dq 0x000000000003ffff, 0x000000000007ffff
+ dq 0x00000000000fffff, 0x00000000001fffff
+ dq 0x00000000003fffff, 0x00000000007fffff
+ dq 0x0000000000ffffff, 0x0000000001ffffff
+ dq 0x0000000003ffffff, 0x0000000007ffffff
+ dq 0x000000000fffffff, 0x000000001fffffff
+ dq 0x000000003fffffff, 0x000000007fffffff
+ dq 0x00000000ffffffff, 0x00000001ffffffff
+ dq 0x00000003ffffffff, 0x00000007ffffffff
+ dq 0x0000000fffffffff, 0x0000001fffffffff
+ dq 0x0000003fffffffff, 0x0000007fffffffff
+ dq 0x000000ffffffffff, 0x000001ffffffffff
+ dq 0x000003ffffffffff, 0x000007ffffffffff
+ dq 0x00000fffffffffff, 0x00001fffffffffff
+ dq 0x00003fffffffffff, 0x00007fffffffffff
+ dq 0x0000ffffffffffff, 0x0001ffffffffffff
+ dq 0x0003ffffffffffff, 0x0007ffffffffffff
+ dq 0x000fffffffffffff, 0x001fffffffffffff
+ dq 0x003fffffffffffff, 0x007fffffffffffff
+ dq 0x00ffffffffffffff, 0x01ffffffffffffff
+ dq 0x03ffffffffffffff, 0x07ffffffffffffff
+ dq 0x0fffffffffffffff, 0x1fffffffffffffff
+ dq 0x3fffffffffffffff, 0x7fffffffffffffff
+ dq 0xffffffffffffffff
+
+align 64
+mask_out_top_block:                                    ; 64 bytes: lowest 48 bytes all ones, top 16 bytes zero
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0x0000000000000000, 0x0000000000000000
+
+section .text
+
+
+;;define the fields of gcm_data struct
+;typedef struct gcm_data
+;{
+; u8 expanded_keys[16*15];
+; u8 shifted_hkey_1[16]; // store HashKey <<1 mod poly here
+; u8 shifted_hkey_2[16]; // store HashKey^2 <<1 mod poly here
+; u8 shifted_hkey_3[16]; // store HashKey^3 <<1 mod poly here
+; u8 shifted_hkey_4[16]; // store HashKey^4 <<1 mod poly here
+; u8 shifted_hkey_5[16]; // store HashKey^5 <<1 mod poly here
+; u8 shifted_hkey_6[16]; // store HashKey^6 <<1 mod poly here
+; u8 shifted_hkey_7[16]; // store HashKey^7 <<1 mod poly here
+; u8 shifted_hkey_8[16]; // store HashKey^8 <<1 mod poly here
+; u8 shifted_hkey_1_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_2_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_3_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_4_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_5_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_6_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_7_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_8_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+;} gcm_data;
+
+%ifndef GCM_KEYS_VAES_AVX512_INCLUDED
+%define HashKey 16*15 ; store HashKey <<1 mod poly here (byte offset; follows the 16*15-byte expanded_keys array)
+%define HashKey_1 16*15 ; store HashKey <<1 mod poly here (same offset as HashKey)
+%define HashKey_2 16*16 ; store HashKey^2 <<1 mod poly here
+%define HashKey_3 16*17 ; store HashKey^3 <<1 mod poly here
+%define HashKey_4 16*18 ; store HashKey^4 <<1 mod poly here
+%define HashKey_5 16*19 ; store HashKey^5 <<1 mod poly here
+%define HashKey_6 16*20 ; store HashKey^6 <<1 mod poly here
+%define HashKey_7 16*21 ; store HashKey^7 <<1 mod poly here
+%define HashKey_8 16*22 ; store HashKey^8 <<1 mod poly here
+%define HashKey_k 16*23 ; store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_2_k 16*24 ; store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_3_k 16*25 ; store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_4_k 16*26 ; store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_5_k 16*27 ; store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_6_k 16*28 ; store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_7_k 16*29 ; store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_8_k 16*30 ; store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+%endif
+
+%define AadHash 16*0 ; store current Hash of data which has been input
+%define AadLen 16*1 ; store length of input data which will not be encrypted or decrypted (8 bytes; InLen follows at +8)
+%define InLen (16*1)+8 ; store length of input data which will be encrypted or decrypted
+%define PBlockEncKey 16*2 ; encryption key for the partial block at the end of the previous update
+%define OrigIV 16*3 ; input IV
+%define CurCount 16*4 ; Current counter for generation of encryption key
+%define PBlockLen 16*5 ; length of partial block at the end of the previous update
+
+%define reg(q) xmm %+ q 
+%define arg(x) [r14 + STACK_OFFSET + 8*x]              ; stack argument slot addressed through r14
+
+
+
+
+%ifidn __OUTPUT_FORMAT__, win64                        ; Microsoft x64 registers only when building for win64, matching the win64 tests in the function prologues
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ %xdefine arg4 r9
+ %xdefine arg5 rsi ;[r14 + STACK_OFFSET + 8*5] - need push and load
+ %xdefine arg6 [r14 + STACK_OFFSET + 8*6]
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*7]
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*8]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*9]
+ %xdefine arg10 [r14 + STACK_OFFSET + 8*10]
+
+%else                                                  ; System V AMD64 registers for every other output format
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ %xdefine arg4 rcx
+ %xdefine arg5 r8
+ %xdefine arg6 r9
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*1]
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*2]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*3]
+ %xdefine arg10 [r14 + STACK_OFFSET + 8*4]
+%endif
+
+%ifdef NT_LDST                                         ; NT_LDST turns on both non-temporal loads and stores
+ %define NT_LD
+ %define NT_ST
+%endif
+
+;;; Use Non-temporal load/store: load mnemonics
+%ifdef NT_LD
+ %define XLDR movntdqa
+ %define VXLDR vmovntdqa
+ %define VX512LDR vmovntdqa
+%else
+ %define XLDR movdqu
+ %define VXLDR vmovdqu
+ %define VX512LDR vmovdqu8
+%endif
+
+;;; Use Non-temporal load/store: store mnemonics
+%ifdef NT_ST
+ %define XSTR movntdq
+ %define VXSTR vmovntdq
+ %define VX512STR vmovntdq
+%else
+ %define XSTR movdqu
+ %define VXSTR vmovdqu
+ %define VX512STR vmovdqu8
+%endif
+
+%endif ; GCM_DEFINES_ASM_INCLUDED
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm
new file mode 100644
index 000000000..fd8aa05a6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm
@@ -0,0 +1,233 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifndef GCM_KEYS_VAES_AVX512_INCLUDED +%define GCM_KEYS_VAES_AVX512_INCLUDED + +;; Define the fields of gcm_key_data struct: +;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; +;; uint8_t shifted_hkey_9_128[GCM_ENC_KEY_LEN * (128 - 8)]; +;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly +;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly +;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly +;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly +;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly +;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly +;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly +;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly + +%ifdef GCM_BIG_DATA +;; +;; Key structure holds up to 128 ghash keys +;; +%define HashKey_128 (16*15) ; HashKey^128 <<1 mod poly +%define HashKey_127 (16*16) ; HashKey^127 <<1 mod poly +%define HashKey_126 (16*17) ; HashKey^126 <<1 mod poly +%define HashKey_125 (16*18) ; HashKey^125 <<1 mod poly +%define HashKey_124 (16*19) ; HashKey^124 <<1 mod poly +%define HashKey_123 (16*20) ; HashKey^123 <<1 mod poly +%define HashKey_122 (16*21) ; HashKey^122 <<1 mod poly +%define HashKey_121 (16*22) ; HashKey^121 <<1 mod poly +%define HashKey_120 (16*23) ; HashKey^120 <<1 mod poly +%define HashKey_119 (16*24) ; HashKey^119 <<1 mod poly +%define HashKey_118 (16*25) ; HashKey^118 <<1 mod poly +%define HashKey_117 (16*26) ; HashKey^117 <<1 mod poly +%define HashKey_116 (16*27) ; HashKey^116 <<1 mod poly +%define HashKey_115 (16*28) ; HashKey^115 <<1 mod poly +%define HashKey_114 (16*29) ; HashKey^114 <<1 mod poly +%define HashKey_113 (16*30) ; HashKey^113 <<1 mod poly +%define HashKey_112 (16*31) ; HashKey^112 <<1 mod poly +%define HashKey_111 (16*32) ; HashKey^111 <<1 mod poly +%define HashKey_110 
(16*33) ; HashKey^110 <<1 mod poly +%define HashKey_109 (16*34) ; HashKey^109 <<1 mod poly +%define HashKey_108 (16*35) ; HashKey^108 <<1 mod poly +%define HashKey_107 (16*36) ; HashKey^107 <<1 mod poly +%define HashKey_106 (16*37) ; HashKey^106 <<1 mod poly +%define HashKey_105 (16*38) ; HashKey^105 <<1 mod poly +%define HashKey_104 (16*39) ; HashKey^104 <<1 mod poly +%define HashKey_103 (16*40) ; HashKey^103 <<1 mod poly +%define HashKey_102 (16*41) ; HashKey^102 <<1 mod poly +%define HashKey_101 (16*42) ; HashKey^101 <<1 mod poly +%define HashKey_100 (16*43) ; HashKey^100 <<1 mod poly +%define HashKey_99 (16*44) ; HashKey^99 <<1 mod poly +%define HashKey_98 (16*45) ; HashKey^98 <<1 mod poly +%define HashKey_97 (16*46) ; HashKey^97 <<1 mod poly +%define HashKey_96 (16*47) ; HashKey^96 <<1 mod poly +%define HashKey_95 (16*48) ; HashKey^95 <<1 mod poly +%define HashKey_94 (16*49) ; HashKey^94 <<1 mod poly +%define HashKey_93 (16*50) ; HashKey^93 <<1 mod poly +%define HashKey_92 (16*51) ; HashKey^92 <<1 mod poly +%define HashKey_91 (16*52) ; HashKey^91 <<1 mod poly +%define HashKey_90 (16*53) ; HashKey^90 <<1 mod poly +%define HashKey_89 (16*54) ; HashKey^89 <<1 mod poly +%define HashKey_88 (16*55) ; HashKey^88 <<1 mod poly +%define HashKey_87 (16*56) ; HashKey^87 <<1 mod poly +%define HashKey_86 (16*57) ; HashKey^86 <<1 mod poly +%define HashKey_85 (16*58) ; HashKey^85 <<1 mod poly +%define HashKey_84 (16*59) ; HashKey^84 <<1 mod poly +%define HashKey_83 (16*60) ; HashKey^83 <<1 mod poly +%define HashKey_82 (16*61) ; HashKey^82 <<1 mod poly +%define HashKey_81 (16*62) ; HashKey^81 <<1 mod poly +%define HashKey_80 (16*63) ; HashKey^80 <<1 mod poly +%define HashKey_79 (16*64) ; HashKey^79 <<1 mod poly +%define HashKey_78 (16*65) ; HashKey^78 <<1 mod poly +%define HashKey_77 (16*66) ; HashKey^77 <<1 mod poly +%define HashKey_76 (16*67) ; HashKey^76 <<1 mod poly +%define HashKey_75 (16*68) ; HashKey^75 <<1 mod poly +%define HashKey_74 (16*69) ; HashKey^74 <<1 mod poly 
+%define HashKey_73 (16*70) ; HashKey^73 <<1 mod poly +%define HashKey_72 (16*71) ; HashKey^72 <<1 mod poly +%define HashKey_71 (16*72) ; HashKey^71 <<1 mod poly +%define HashKey_70 (16*73) ; HashKey^70 <<1 mod poly +%define HashKey_69 (16*74) ; HashKey^69 <<1 mod poly +%define HashKey_68 (16*75) ; HashKey^68 <<1 mod poly +%define HashKey_67 (16*76) ; HashKey^67 <<1 mod poly +%define HashKey_66 (16*77) ; HashKey^66 <<1 mod poly +%define HashKey_65 (16*78) ; HashKey^65 <<1 mod poly +%define HashKey_64 (16*79) ; HashKey^64 <<1 mod poly +%define HashKey_63 (16*80) ; HashKey^63 <<1 mod poly +%define HashKey_62 (16*81) ; HashKey^62 <<1 mod poly +%define HashKey_61 (16*82) ; HashKey^61 <<1 mod poly +%define HashKey_60 (16*83) ; HashKey^60 <<1 mod poly +%define HashKey_59 (16*84) ; HashKey^59 <<1 mod poly +%define HashKey_58 (16*85) ; HashKey^58 <<1 mod poly +%define HashKey_57 (16*86) ; HashKey^57 <<1 mod poly +%define HashKey_56 (16*87) ; HashKey^56 <<1 mod poly +%define HashKey_55 (16*88) ; HashKey^55 <<1 mod poly +%define HashKey_54 (16*89) ; HashKey^54 <<1 mod poly +%define HashKey_53 (16*90) ; HashKey^53 <<1 mod poly +%define HashKey_52 (16*91) ; HashKey^52 <<1 mod poly +%define HashKey_51 (16*92) ; HashKey^51 <<1 mod poly +%define HashKey_50 (16*93) ; HashKey^50 <<1 mod poly +%define HashKey_49 (16*94) ; HashKey^49 <<1 mod poly +%define HashKey_48 (16*95) ; HashKey^48 <<1 mod poly +%define HashKey_47 (16*96) ; HashKey^47 <<1 mod poly +%define HashKey_46 (16*97) ; HashKey^46 <<1 mod poly +%define HashKey_45 (16*98) ; HashKey^45 <<1 mod poly +%define HashKey_44 (16*99) ; HashKey^44 <<1 mod poly +%define HashKey_43 (16*100) ; HashKey^43 <<1 mod poly +%define HashKey_42 (16*101) ; HashKey^42 <<1 mod poly +%define HashKey_41 (16*102) ; HashKey^41 <<1 mod poly +%define HashKey_40 (16*103) ; HashKey^40 <<1 mod poly +%define HashKey_39 (16*104) ; HashKey^39 <<1 mod poly +%define HashKey_38 (16*105) ; HashKey^38 <<1 mod poly +%define HashKey_37 (16*106) ; HashKey^37 <<1 mod 
poly +%define HashKey_36 (16*107) ; HashKey^36 <<1 mod poly +%define HashKey_35 (16*108) ; HashKey^35 <<1 mod poly +%define HashKey_34 (16*109) ; HashKey^34 <<1 mod poly +%define HashKey_33 (16*110) ; HashKey^33 <<1 mod poly +%define HashKey_32 (16*111) ; HashKey^32 <<1 mod poly +%define HashKey_31 (16*112) ; HashKey^31 <<1 mod poly +%define HashKey_30 (16*113) ; HashKey^30 <<1 mod poly +%define HashKey_29 (16*114) ; HashKey^29 <<1 mod poly +%define HashKey_28 (16*115) ; HashKey^28 <<1 mod poly +%define HashKey_27 (16*116) ; HashKey^27 <<1 mod poly +%define HashKey_26 (16*117) ; HashKey^26 <<1 mod poly +%define HashKey_25 (16*118) ; HashKey^25 <<1 mod poly +%define HashKey_24 (16*119) ; HashKey^24 <<1 mod poly +%define HashKey_23 (16*120) ; HashKey^23 <<1 mod poly +%define HashKey_22 (16*121) ; HashKey^22 <<1 mod poly +%define HashKey_21 (16*122) ; HashKey^21 <<1 mod poly +%define HashKey_20 (16*123) ; HashKey^20 <<1 mod poly +%define HashKey_19 (16*124) ; HashKey^19 <<1 mod poly +%define HashKey_18 (16*125) ; HashKey^18 <<1 mod poly +%define HashKey_17 (16*126) ; HashKey^17 <<1 mod poly +%define HashKey_16 (16*127) ; HashKey^16 <<1 mod poly +%define HashKey_15 (16*128) ; HashKey^15 <<1 mod poly +%define HashKey_14 (16*129) ; HashKey^14 <<1 mod poly +%define HashKey_13 (16*130) ; HashKey^13 <<1 mod poly +%define HashKey_12 (16*131) ; HashKey^12 <<1 mod poly +%define HashKey_11 (16*132) ; HashKey^11 <<1 mod poly +%define HashKey_10 (16*133) ; HashKey^10 <<1 mod poly +%define HashKey_9 (16*134) ; HashKey^9 <<1 mod poly +%define HashKey_8 (16*135) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*136) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*137) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*138) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*139) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*140) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*141) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*142) ; HashKey <<1 mod poly +%define HashKey (16*142) ; 
HashKey <<1 mod poly +%else +;; +;; Key structure holds up to 48 ghash keys +;; +%define HashKey_48 (16*15) ; HashKey^48 <<1 mod poly +%define HashKey_47 (16*16) ; HashKey^47 <<1 mod poly +%define HashKey_46 (16*17) ; HashKey^46 <<1 mod poly +%define HashKey_45 (16*18) ; HashKey^45 <<1 mod poly +%define HashKey_44 (16*19) ; HashKey^44 <<1 mod poly +%define HashKey_43 (16*20) ; HashKey^43 <<1 mod poly +%define HashKey_42 (16*21) ; HashKey^42 <<1 mod poly +%define HashKey_41 (16*22) ; HashKey^41 <<1 mod poly +%define HashKey_40 (16*23) ; HashKey^40 <<1 mod poly +%define HashKey_39 (16*24) ; HashKey^39 <<1 mod poly +%define HashKey_38 (16*25) ; HashKey^38 <<1 mod poly +%define HashKey_37 (16*26) ; HashKey^37 <<1 mod poly +%define HashKey_36 (16*27) ; HashKey^36 <<1 mod poly +%define HashKey_35 (16*28) ; HashKey^35 <<1 mod poly +%define HashKey_34 (16*29) ; HashKey^34 <<1 mod poly +%define HashKey_33 (16*30) ; HashKey^33 <<1 mod poly +%define HashKey_32 (16*31) ; HashKey^32 <<1 mod poly +%define HashKey_31 (16*32) ; HashKey^31 <<1 mod poly +%define HashKey_30 (16*33) ; HashKey^30 <<1 mod poly +%define HashKey_29 (16*34) ; HashKey^29 <<1 mod poly +%define HashKey_28 (16*35) ; HashKey^28 <<1 mod poly +%define HashKey_27 (16*36) ; HashKey^27 <<1 mod poly +%define HashKey_26 (16*37) ; HashKey^26 <<1 mod poly +%define HashKey_25 (16*38) ; HashKey^25 <<1 mod poly +%define HashKey_24 (16*39) ; HashKey^24 <<1 mod poly +%define HashKey_23 (16*40) ; HashKey^23 <<1 mod poly +%define HashKey_22 (16*41) ; HashKey^22 <<1 mod poly +%define HashKey_21 (16*42) ; HashKey^21 <<1 mod poly +%define HashKey_20 (16*43) ; HashKey^20 <<1 mod poly +%define HashKey_19 (16*44) ; HashKey^19 <<1 mod poly +%define HashKey_18 (16*45) ; HashKey^18 <<1 mod poly +%define HashKey_17 (16*46) ; HashKey^17 <<1 mod poly +%define HashKey_16 (16*47) ; HashKey^16 <<1 mod poly +%define HashKey_15 (16*48) ; HashKey^15 <<1 mod poly +%define HashKey_14 (16*49) ; HashKey^14 <<1 mod poly +%define HashKey_13 (16*50) ; 
HashKey^13 <<1 mod poly +%define HashKey_12 (16*51) ; HashKey^12 <<1 mod poly +%define HashKey_11 (16*52) ; HashKey^11 <<1 mod poly +%define HashKey_10 (16*53) ; HashKey^10 <<1 mod poly +%define HashKey_9 (16*54) ; HashKey^9 <<1 mod poly +%define HashKey_8 (16*55) ; HashKey^8 <<1 mod poly +%define HashKey_7 (16*56) ; HashKey^7 <<1 mod poly +%define HashKey_6 (16*57) ; HashKey^6 <<1 mod poly +%define HashKey_5 (16*58) ; HashKey^5 <<1 mod poly +%define HashKey_4 (16*59) ; HashKey^4 <<1 mod poly +%define HashKey_3 (16*60) ; HashKey^3 <<1 mod poly +%define HashKey_2 (16*61) ; HashKey^2 <<1 mod poly +%define HashKey_1 (16*62) ; HashKey <<1 mod poly +%define HashKey (16*62) ; HashKey <<1 mod poly +%endif ; !GCM_BIG_DATA + +%endif ; GCM_KEYS_VAES_AVX512_INCLUDED diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm new file mode 100644 index 000000000..6f71e43fa --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm @@ -0,0 +1,184 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern aes_gcm_init_128_sse +extern aes_gcm_init_128_avx_gen4 +extern aes_gcm_init_128_avx_gen2 + +extern aes_gcm_enc_128_sse +extern aes_gcm_enc_128_avx_gen4 +extern aes_gcm_enc_128_avx_gen2 +extern aes_gcm_enc_128_update_sse +extern aes_gcm_enc_128_update_avx_gen4 +extern aes_gcm_enc_128_update_avx_gen2 +extern aes_gcm_enc_128_finalize_sse +extern aes_gcm_enc_128_finalize_avx_gen4 +extern aes_gcm_enc_128_finalize_avx_gen2 + +extern aes_gcm_dec_128_sse +extern aes_gcm_dec_128_avx_gen4 +extern aes_gcm_dec_128_avx_gen2 +extern aes_gcm_dec_128_update_sse +extern aes_gcm_dec_128_update_avx_gen4 +extern aes_gcm_dec_128_update_avx_gen2 +extern aes_gcm_dec_128_finalize_sse +extern aes_gcm_dec_128_finalize_avx_gen4 +extern aes_gcm_dec_128_finalize_avx_gen2 + +extern aes_gcm_precomp_128_sse +extern aes_gcm_precomp_128_avx_gen4 +extern aes_gcm_precomp_128_avx_gen2 + +extern aes_gcm_init_256_sse +extern aes_gcm_init_256_avx_gen4 +extern aes_gcm_init_256_avx_gen2 + +extern aes_gcm_enc_256_sse +extern aes_gcm_enc_256_avx_gen4 +extern aes_gcm_enc_256_avx_gen2 +extern 
aes_gcm_enc_256_update_sse +extern aes_gcm_enc_256_update_avx_gen4 +extern aes_gcm_enc_256_update_avx_gen2 +extern aes_gcm_enc_256_finalize_sse +extern aes_gcm_enc_256_finalize_avx_gen4 +extern aes_gcm_enc_256_finalize_avx_gen2 + +extern aes_gcm_dec_256_sse +extern aes_gcm_dec_256_avx_gen4 +extern aes_gcm_dec_256_avx_gen2 +extern aes_gcm_dec_256_update_sse +extern aes_gcm_dec_256_update_avx_gen4 +extern aes_gcm_dec_256_update_avx_gen2 +extern aes_gcm_dec_256_finalize_sse +extern aes_gcm_dec_256_finalize_avx_gen4 +extern aes_gcm_dec_256_finalize_avx_gen2 + +extern aes_gcm_precomp_256_sse +extern aes_gcm_precomp_256_avx_gen4 +extern aes_gcm_precomp_256_avx_gen2 + +%if (AS_FEATURE_LEVEL) >= 10 +extern aes_gcm_precomp_128_vaes_avx512 +extern aes_gcm_init_128_vaes_avx512 +extern aes_gcm_enc_128_update_vaes_avx512 +extern aes_gcm_dec_128_update_vaes_avx512 +extern aes_gcm_enc_128_finalize_vaes_avx512 +extern aes_gcm_dec_128_finalize_vaes_avx512 +extern aes_gcm_enc_128_vaes_avx512 +extern aes_gcm_dec_128_vaes_avx512 + +extern aes_gcm_precomp_256_vaes_avx512 +extern aes_gcm_init_256_vaes_avx512 +extern aes_gcm_enc_256_update_vaes_avx512 +extern aes_gcm_dec_256_update_vaes_avx512 +extern aes_gcm_enc_256_finalize_vaes_avx512 +extern aes_gcm_dec_256_finalize_vaes_avx512 +extern aes_gcm_enc_256_vaes_avx512 +extern aes_gcm_dec_256_vaes_avx512 +%endif + +section .text + +%include "multibinary.asm" + +;;;; +; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp +;;;; +mbin_interface aes_gcm_init_128 +mbin_dispatch_init7 aes_gcm_init_128, aes_gcm_init_128_sse, aes_gcm_init_128_sse, aes_gcm_init_128_avx_gen2, aes_gcm_init_128_avx_gen4, aes_gcm_init_128_avx_gen4, aes_gcm_init_128_vaes_avx512 + +mbin_interface aes_gcm_enc_128 +mbin_dispatch_init7 aes_gcm_enc_128, aes_gcm_enc_128_sse, aes_gcm_enc_128_sse, aes_gcm_enc_128_avx_gen2, aes_gcm_enc_128_avx_gen4, aes_gcm_enc_128_avx_gen4, aes_gcm_enc_128_vaes_avx512 + +mbin_interface 
aes_gcm_enc_128_update +mbin_dispatch_init7 aes_gcm_enc_128_update, aes_gcm_enc_128_update_sse, aes_gcm_enc_128_update_sse, aes_gcm_enc_128_update_avx_gen2, aes_gcm_enc_128_update_avx_gen4, aes_gcm_enc_128_update_avx_gen4, aes_gcm_enc_128_update_vaes_avx512 + +mbin_interface aes_gcm_enc_128_finalize +mbin_dispatch_init7 aes_gcm_enc_128_finalize, aes_gcm_enc_128_finalize_sse, aes_gcm_enc_128_finalize_sse, aes_gcm_enc_128_finalize_avx_gen2, aes_gcm_enc_128_finalize_avx_gen4, aes_gcm_enc_128_finalize_avx_gen4, aes_gcm_enc_128_finalize_vaes_avx512 + +mbin_interface aes_gcm_dec_128 +mbin_dispatch_init7 aes_gcm_dec_128, aes_gcm_dec_128_sse, aes_gcm_dec_128_sse, aes_gcm_dec_128_avx_gen2, aes_gcm_dec_128_avx_gen4, aes_gcm_dec_128_avx_gen4, aes_gcm_dec_128_vaes_avx512 + +mbin_interface aes_gcm_dec_128_update +mbin_dispatch_init7 aes_gcm_dec_128_update, aes_gcm_dec_128_update_sse, aes_gcm_dec_128_update_sse, aes_gcm_dec_128_update_avx_gen2, aes_gcm_dec_128_update_avx_gen4, aes_gcm_dec_128_update_avx_gen4, aes_gcm_dec_128_update_vaes_avx512 + +mbin_interface aes_gcm_dec_128_finalize +mbin_dispatch_init7 aes_gcm_dec_128_finalize, aes_gcm_dec_128_finalize_sse, aes_gcm_dec_128_finalize_sse, aes_gcm_dec_128_finalize_avx_gen2, aes_gcm_dec_128_finalize_avx_gen4, aes_gcm_dec_128_finalize_avx_gen4, aes_gcm_dec_128_finalize_vaes_avx512 + +mbin_interface aes_gcm_precomp_128 +mbin_dispatch_init7 aes_gcm_precomp_128, aes_gcm_precomp_128_sse, aes_gcm_precomp_128_sse, aes_gcm_precomp_128_avx_gen2, aes_gcm_precomp_128_avx_gen4, aes_gcm_precomp_128_avx_gen4, aes_gcm_precomp_128_vaes_avx512 + +;;;; +; instantiate the 256-bit key aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp +; each table names the interface, then its sse, sse, avx_gen2, avx_gen4, avx_gen4 and vaes_avx512 implementations +;;;; +mbin_interface aes_gcm_init_256 +mbin_dispatch_init7 aes_gcm_init_256, aes_gcm_init_256_sse, aes_gcm_init_256_sse, aes_gcm_init_256_avx_gen2, aes_gcm_init_256_avx_gen4, aes_gcm_init_256_avx_gen4, aes_gcm_init_256_vaes_avx512 + +mbin_interface aes_gcm_enc_256 +mbin_dispatch_init7
aes_gcm_enc_256, aes_gcm_enc_256_sse, aes_gcm_enc_256_sse, aes_gcm_enc_256_avx_gen2, aes_gcm_enc_256_avx_gen4, aes_gcm_enc_256_avx_gen4, aes_gcm_enc_256_vaes_avx512 + +mbin_interface aes_gcm_enc_256_update +mbin_dispatch_init7 aes_gcm_enc_256_update, aes_gcm_enc_256_update_sse, aes_gcm_enc_256_update_sse, aes_gcm_enc_256_update_avx_gen2, aes_gcm_enc_256_update_avx_gen4, aes_gcm_enc_256_update_avx_gen4, aes_gcm_enc_256_update_vaes_avx512 + +mbin_interface aes_gcm_enc_256_finalize +mbin_dispatch_init7 aes_gcm_enc_256_finalize, aes_gcm_enc_256_finalize_sse, aes_gcm_enc_256_finalize_sse, aes_gcm_enc_256_finalize_avx_gen2, aes_gcm_enc_256_finalize_avx_gen4, aes_gcm_enc_256_finalize_avx_gen4, aes_gcm_enc_256_finalize_vaes_avx512 + +mbin_interface aes_gcm_dec_256 +mbin_dispatch_init7 aes_gcm_dec_256, aes_gcm_dec_256_sse, aes_gcm_dec_256_sse, aes_gcm_dec_256_avx_gen2, aes_gcm_dec_256_avx_gen4, aes_gcm_dec_256_avx_gen4, aes_gcm_dec_256_vaes_avx512 + +mbin_interface aes_gcm_dec_256_update +mbin_dispatch_init7 aes_gcm_dec_256_update, aes_gcm_dec_256_update_sse, aes_gcm_dec_256_update_sse, aes_gcm_dec_256_update_avx_gen2, aes_gcm_dec_256_update_avx_gen4, aes_gcm_dec_256_update_avx_gen4, aes_gcm_dec_256_update_vaes_avx512 + +mbin_interface aes_gcm_dec_256_finalize +mbin_dispatch_init7 aes_gcm_dec_256_finalize, aes_gcm_dec_256_finalize_sse, aes_gcm_dec_256_finalize_sse, aes_gcm_dec_256_finalize_avx_gen2, aes_gcm_dec_256_finalize_avx_gen4, aes_gcm_dec_256_finalize_avx_gen4, aes_gcm_dec_256_finalize_vaes_avx512 + +mbin_interface aes_gcm_precomp_256 +mbin_dispatch_init7 aes_gcm_precomp_256, aes_gcm_precomp_256_sse, aes_gcm_precomp_256_sse, aes_gcm_precomp_256_avx_gen2, aes_gcm_precomp_256_avx_gen4, aes_gcm_precomp_256_avx_gen4, aes_gcm_precomp_256_vaes_avx512 + + +;;; func core, ver, snum +slversion aes_gcm_enc_128, 00, 00, 02c0 +slversion aes_gcm_dec_128, 00, 00, 02c1 +slversion aes_gcm_init_128, 00, 00, 02c2 +slversion aes_gcm_enc_128_update, 00, 00, 02c3 +slversion 
aes_gcm_dec_128_update, 00, 00, 02c4 +slversion aes_gcm_enc_128_finalize, 00, 00, 02c5 +slversion aes_gcm_dec_128_finalize, 00, 00, 02c6 +slversion aes_gcm_enc_256, 00, 00, 02d0 +slversion aes_gcm_dec_256, 00, 00, 02d1 +slversion aes_gcm_init_256, 00, 00, 02d2 +slversion aes_gcm_enc_256_update, 00, 00, 02d3 +slversion aes_gcm_dec_256_update, 00, 00, 02d4 +slversion aes_gcm_enc_256_finalize, 00, 00, 02d5 +slversion aes_gcm_dec_256_finalize, 00, 00, 02d6 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm new file mode 100644 index 000000000..4c5083173 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm @@ -0,0 +1,118 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern aes_gcm_enc_128_sse_nt +extern aes_gcm_enc_128_avx_gen4_nt +extern aes_gcm_enc_128_avx_gen2_nt +extern aes_gcm_enc_128_update_sse_nt +extern aes_gcm_enc_128_update_avx_gen4_nt +extern aes_gcm_enc_128_update_avx_gen2_nt + +extern aes_gcm_dec_128_sse_nt +extern aes_gcm_dec_128_avx_gen4_nt +extern aes_gcm_dec_128_avx_gen2_nt +extern aes_gcm_dec_128_update_sse_nt +extern aes_gcm_dec_128_update_avx_gen4_nt +extern aes_gcm_dec_128_update_avx_gen2_nt + +extern aes_gcm_enc_256_sse_nt +extern aes_gcm_enc_256_avx_gen4_nt +extern aes_gcm_enc_256_avx_gen2_nt +extern aes_gcm_enc_256_update_sse_nt +extern aes_gcm_enc_256_update_avx_gen4_nt +extern aes_gcm_enc_256_update_avx_gen2_nt + +extern aes_gcm_dec_256_sse_nt +extern aes_gcm_dec_256_avx_gen4_nt +extern aes_gcm_dec_256_avx_gen2_nt +extern aes_gcm_dec_256_update_sse_nt +extern aes_gcm_dec_256_update_avx_gen4_nt +extern aes_gcm_dec_256_update_avx_gen2_nt + +; The vaes_avx512 NT symbols are only declared when AS_FEATURE_LEVEL is at least 10. +; NOTE(review): the dispatch tables below name them unconditionally; presumably mbin_dispatch_init7 drops its last argument at lower feature levels - confirm in multibinary.asm +%if (AS_FEATURE_LEVEL) >= 10 +extern aes_gcm_enc_128_update_vaes_avx512_nt +extern aes_gcm_dec_128_update_vaes_avx512_nt +extern aes_gcm_enc_128_vaes_avx512_nt +extern aes_gcm_dec_128_vaes_avx512_nt + +extern aes_gcm_enc_256_update_vaes_avx512_nt +extern aes_gcm_dec_256_update_vaes_avx512_nt +extern aes_gcm_enc_256_vaes_avx512_nt +extern aes_gcm_dec_256_vaes_avx512_nt +%endif + +section .text + +%include
"multibinary.asm" + +;;;; +; instantiate aes_gcm NT interfaces enc, enc_update, dec, dec_update for 128-bit keys +; each table names the interface, then its sse, sse, avx_gen2, avx_gen4, avx_gen4 and vaes_avx512 implementations +;;;; +mbin_interface aes_gcm_enc_128_nt +mbin_dispatch_init7 aes_gcm_enc_128_nt, aes_gcm_enc_128_sse_nt, aes_gcm_enc_128_sse_nt, aes_gcm_enc_128_avx_gen2_nt, aes_gcm_enc_128_avx_gen4_nt, aes_gcm_enc_128_avx_gen4_nt, aes_gcm_enc_128_vaes_avx512_nt + +mbin_interface aes_gcm_enc_128_update_nt +mbin_dispatch_init7 aes_gcm_enc_128_update_nt, aes_gcm_enc_128_update_sse_nt, aes_gcm_enc_128_update_sse_nt, aes_gcm_enc_128_update_avx_gen2_nt, aes_gcm_enc_128_update_avx_gen4_nt, aes_gcm_enc_128_update_avx_gen4_nt, aes_gcm_enc_128_update_vaes_avx512_nt + +mbin_interface aes_gcm_dec_128_nt +mbin_dispatch_init7 aes_gcm_dec_128_nt, aes_gcm_dec_128_sse_nt, aes_gcm_dec_128_sse_nt, aes_gcm_dec_128_avx_gen2_nt, aes_gcm_dec_128_avx_gen4_nt, aes_gcm_dec_128_avx_gen4_nt, aes_gcm_dec_128_vaes_avx512_nt + +mbin_interface aes_gcm_dec_128_update_nt +mbin_dispatch_init7 aes_gcm_dec_128_update_nt, aes_gcm_dec_128_update_sse_nt, aes_gcm_dec_128_update_sse_nt, aes_gcm_dec_128_update_avx_gen2_nt, aes_gcm_dec_128_update_avx_gen4_nt, aes_gcm_dec_128_update_avx_gen4_nt, aes_gcm_dec_128_update_vaes_avx512_nt + +;;;; +; instantiate aes_gcm NT interfaces enc, enc_update, dec, dec_update for 256-bit keys +;;;; +mbin_interface aes_gcm_enc_256_nt +mbin_dispatch_init7 aes_gcm_enc_256_nt, aes_gcm_enc_256_sse_nt, aes_gcm_enc_256_sse_nt, aes_gcm_enc_256_avx_gen2_nt, aes_gcm_enc_256_avx_gen4_nt, aes_gcm_enc_256_avx_gen4_nt, aes_gcm_enc_256_vaes_avx512_nt + +mbin_interface aes_gcm_enc_256_update_nt +mbin_dispatch_init7 aes_gcm_enc_256_update_nt, aes_gcm_enc_256_update_sse_nt, aes_gcm_enc_256_update_sse_nt, aes_gcm_enc_256_update_avx_gen2_nt, aes_gcm_enc_256_update_avx_gen4_nt, aes_gcm_enc_256_update_avx_gen4_nt, aes_gcm_enc_256_update_vaes_avx512_nt + +mbin_interface aes_gcm_dec_256_nt +mbin_dispatch_init7 aes_gcm_dec_256_nt, aes_gcm_dec_256_sse_nt, aes_gcm_dec_256_sse_nt,
aes_gcm_dec_256_avx_gen2_nt, aes_gcm_dec_256_avx_gen4_nt, aes_gcm_dec_256_avx_gen4_nt, aes_gcm_dec_256_vaes_avx512_nt + +mbin_interface aes_gcm_dec_256_update_nt +mbin_dispatch_init7 aes_gcm_dec_256_update_nt, aes_gcm_dec_256_update_sse_nt, aes_gcm_dec_256_update_sse_nt, aes_gcm_dec_256_update_avx_gen2_nt, aes_gcm_dec_256_update_avx_gen4_nt, aes_gcm_dec_256_update_avx_gen4_nt, aes_gcm_dec_256_update_vaes_avx512_nt + + +;;; func core, ver, snum +slversion aes_gcm_enc_128_nt, 00, 00, 02e1 +slversion aes_gcm_dec_128_nt, 00, 00, 02e2 +slversion aes_gcm_enc_128_update_nt, 00, 00, 02e3 +slversion aes_gcm_dec_128_update_nt, 00, 00, 02e4 +slversion aes_gcm_enc_256_nt, 00, 00, 02e5 +slversion aes_gcm_dec_256_nt, 00, 00, 02e6 +slversion aes_gcm_enc_256_update_nt, 00, 00, 02e7 +slversion aes_gcm_dec_256_update_nt, 00, 00, 02e8 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c new file mode 100644 index 000000000..529d36b31 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c @@ -0,0 +1,2038 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> // for memcmp +#include <aes_gcm.h> +#include <openssl/sha.h> +#include "gcm_vectors.h" +#include "ossl_helper.h" +#include "types.h" + +//#define GCM_VECTORS_VERBOSE +//#define GCM_VECTORS_EXTRA_VERBOSE +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#ifndef RANDOMS +# define RANDOMS 200 +#endif +#ifndef TEST_LEN +# define TEST_LEN 32*1024 +#endif +#ifndef PAGE_LEN +# define PAGE_LEN (4*1024) +#endif + +// NT versions require 64B alignment +# define NT_ALIGNMENT (64) +# define ALIGNMENT_MASK (~(NT_ALIGNMENT - 1)) +# define OFFSET_BASE_VALUE (NT_ALIGNMENT) +#ifndef MAX_UNALIGNED +# define MAX_UNALIGNED (1) +#endif + +void dump_table(char *title, uint8_t * table, uint8_t count) +{ + int i; + char const *space = " "; + + printf("%s%s => {\n", space, title); + for (i = 0; i < count; i++) { + if (0 == (i & 15)) + printf("%s%s", space, space); + printf("%2x, ", table[i]); + if (15 == (i & 15)) + printf("\n"); + + } + printf("%s}\n", space); +} + +void dump_gcm_data(struct gcm_key_data *gkey) +{ +#ifdef GCM_VECTORS_EXTRA_VERBOSE + 
printf("gcm_data {\n"); + dump_table("expanded_keys", gkey->expanded_keys, (16 * 11)); + dump_table("shifted_hkey_1", gkey->shifted_hkey_1, 16); + dump_table("shifted_hkey_2", gkey->shifted_hkey_2, 16); + dump_table("shifted_hkey_3", gkey->shifted_hkey_3, 16); + dump_table("shifted_hkey_4", gkey->shifted_hkey_4, 16); + dump_table("shifted_hkey_5", gkey->shifted_hkey_5, 16); + dump_table("shifted_hkey_6", gkey->shifted_hkey_6, 16); + dump_table("shifted_hkey_7", gkey->shifted_hkey_7, 16); + dump_table("shifted_hkey_8", gkey->shifted_hkey_8, 16); + dump_table("shifted_hkey_1_k", gkey->shifted_hkey_1_k, 16); + dump_table("shifted_hkey_2_k", gkey->shifted_hkey_2_k, 16); + dump_table("shifted_hkey_3_k", gkey->shifted_hkey_3_k, 16); + dump_table("shifted_hkey_4_k", gkey->shifted_hkey_4_k, 16); + dump_table("shifted_hkey_5_k", gkey->shifted_hkey_5_k, 16); + dump_table("shifted_hkey_6_k", gkey->shifted_hkey_6_k, 16); + dump_table("shifted_hkey_7_k", gkey->shifted_hkey_7_k, 16); + dump_table("shifted_hkey_8_k", gkey->shifted_hkey_8_k, 16); + printf("}\n"); +#endif //GCM_VECTORS_EXTRA_VERBOSE +} + +// Fill the first `size` bytes of `data` with values from rand(), each truncated to 8 bits. +void mk_rand_data(uint8_t * data, uint32_t size) +{ + uint32_t i; // same type as size, so the loop bound is not a signed/unsigned comparison + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +// Compare `len` bytes of `test` against `expected`. +// Returns 0 when they are equal; otherwise prints `data_name`, the first differing byte pair and its offset, and returns 1. +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; // despite the name, non-zero means failure + + mismatch = memcmp(test, expected, len); + if (mismatch) { + OK = 1; + printf(" expected results don't match %s \t\t", data_name); + { + uint64_t a; + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + // uint64_t is not unsigned long on every ABI: cast to the type %llx expects + printf(" '%x' != '%x' at %llx of %llx\n", + test[a], expected[a], + (unsigned long long)a, (unsigned long long)len); + break; + } + } + } + } + return OK; +} + +int check_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + int ret; +
+#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate 64-byte aligned buffers: saved plaintext, saved ISA-L ciphertext, OpenSSL ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; /* NOTE(review): buffers from the posix_memalign calls that did succeed are not released on this path */ + } + } + IV_alloc_len = vector->IVlen; + + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); /* this allocation is the IV copy, although the message says ciphertext */ + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128_nt(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); /* reference ciphertext and tag land in o_ct_test / o_T_test */ + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); /* keep the original plaintext for the comparisons after decryption */ + memset(vector->P, 0, vector->Plen); /* wipe P: the decrypt call below is given vector->P as its output buffer */ + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); /* wipe T: the tag is compared against o_T_test again after decryption */ + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_128_nt(gkey, gctx, vector->P, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T,
vector->Tlen); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + aligned_free(ct_test); + aligned_free(o_ct_test); + } + + return OK; +} + +int check_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int test_len) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break; + int i, ret; + uint8_t *rand_data = NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate 
ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Private copy of the IV, handed to aes_gcm_init_128 below + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + + last_break = 0; + i = (rand() % test_len / 8) & ALIGNMENT_MASK; /* first break point; the mask rounds it down to a multiple of NT_ALIGNMENT */ + while (i < (vector->Plen)) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, (i - last_break)); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->P + last_break, i - last_break); /* the chunk is fed from a fresh 64-byte aligned copy */ + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, stream, + i - last_break); /* NOTE(review): for an empty chunk, stream is NULL or an already freed pointer and the length passed is 0 */ + if (i - last_break != 0) + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); /* presumably unrelated work interleaved between updates - confirm intent */ + } + last_break = i; + i = (rand() % test_len / 8) & ALIGNMENT_MASK; /* NOTE(review): i is drawn afresh rather than advanced from last_break, so it can fall below last_break and (i - last_break) would wrap as unsigned - confirm intended */ + + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); /* remaining tail is read directly from vector->P, without an aligned copy */ + if (gctx->in_length != vector->Plen) + printf("%lu, %lu\n", gctx->in_length, vector->Plen); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); +
memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = 0; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (rand() % (test_len / 64) == 0) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->C + last_break, i - last_break); + } + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + 
aligned_free(ct_test); + aligned_free(o_ct_test); + } + free(rand_data); + + return OK; +} + +int check_strm_vector2(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int length, int start, int breaks) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break = 0; + int i = length; + uint8_t *rand_data = NULL; + int ret; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + ret = posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128_nt(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - 
last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->P + last_break, i - last_break); + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + aligned_free(stream); + last_break = i; + i = i + (length - start) / breaks; + + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = length; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->C + last_break, i - last_break); + } + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + aligned_free(stream); + last_break = i; + i = i + (length - start) / breaks; + + } + + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); 
+ OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(rand_data); + if (vector->Plen != 0) { + free(pt_test); + free(ct_test); + aligned_free(o_ct_test); + } + + return OK; +} + +int check_strm_vector_efence(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break = 0; + int i = 1; + uint8_t *rand_data = NULL; + uint64_t length; + int ret; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + 
IV_alloc_len = vector->IVlen; + + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < vector->Plen) { + if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) { + ret = posix_memalign((void **)&stream, 64, PAGE_LEN); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + i = i & ALIGNMENT_MASK; + memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break, + i - last_break); + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, + stream + PAGE_LEN - (i - last_break), + i - last_break); + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, 
vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = 0; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < vector->Plen) { + if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) { + ret = posix_memalign((void **)&stream, 64, PAGE_LEN); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + i = i & ALIGNMENT_MASK; + memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break, + i - last_break); + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, + stream + PAGE_LEN - (i - last_break), + i - last_break); + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + aligned_free(ct_test); + aligned_free(o_ct_test); + } + free(rand_data); + + return OK; +} + +int check_256_vector(struct gcm_key_data *gkey, struct 
gcm_context_data *gctx, + gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + int ret; + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_256_nt(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, 
vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_256_nt(gkey, gctx, vector->P, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted ISA-L plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_256_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted OpenSSL plain text (P)"); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + aligned_free(ct_test); + aligned_free(o_ct_test); + } + + return OK; +} + +int check_256_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int test_len) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break; + int i, ret; + uint8_t *rand_data = NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, 
(int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + ret = posix_memalign((void **)&pt_test, 64, vector->Plen); + ret |= posix_memalign((void **)&ct_test, 64, vector->Plen); + ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen); + if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL) + || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen); + + last_break = 0; + i = (rand() % test_len / 8) & ALIGNMENT_MASK; + while (i < (vector->Plen)) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->P + last_break, i - last_break); + } + + aes_gcm_enc_256_update_nt(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + i += (rand() % test_len / 8) & ALIGNMENT_MASK; + + } + aes_gcm_enc_256_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + if (gctx->in_length != vector->Plen) + printf("%lu, %lu\n", 
gctx->in_length, vector->Plen); + aes_gcm_enc_256_finalize(gkey, gctx, vector->T, vector->Tlen); + + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i += (rand() % test_len / 8) & ALIGNMENT_MASK; + aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + ret = posix_memalign((void **)&stream, 64, i - last_break); + if ((ret != 0) || (stream == NULL)) { + OK = 1; + fprintf(stderr, "posix_memalign failed\n"); + break; + } + memcpy(stream, vector->C + last_break, i - last_break); + } + + aes_gcm_dec_256_update_nt(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + aligned_free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + i += (rand() % test_len / 8) & ALIGNMENT_MASK; + + } + aes_gcm_dec_256_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_256_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted ISA-L plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_256_nt(gkey, gctx, 
vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted OpenSSL plain text (P)"); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + if (vector->Plen != 0) { + aligned_free(pt_test); + aligned_free(ct_test); + aligned_free(o_ct_test); + } + + return OK; +} + +int test_gcm_strm_efence(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random efence test vectors with random stream:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = 
malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector_efence(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm_strm_combinations(int test_len) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + uint8_t *gkeytemp = NULL; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkeytemp = malloc(sizeof(struct gcm_key_data) + 64); + gctx = malloc(sizeof(struct gcm_context_data)); + gkey = (struct gcm_key_data *)(gkeytemp + rand() % 64); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors with random stream of average size %d:", + test_len / 64); + for (t = 0; RANDOMS > t; t++) { + int Plen = 0; // (rand() % test_len); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % test_len); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = 
OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector(gkey, gctx, &test, test_len)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkeytemp); + free(gctx); + return 0; +} + +int test_gcm_combinations(void) +{ + 
gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths 
of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_vector(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm256_combinations(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES-GCM-256 random test vectors:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_256_KEY_LEN + offset); + test.Klen = GCM_256_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + 
if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_256_vector(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm256_strm_combinations(int test_len) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + uint8_t *gkeytemp = NULL; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkeytemp = malloc(sizeof(struct gcm_key_data) + 64); + gctx = malloc(sizeof(struct gcm_context_data)); + gkey = (struct gcm_key_data *)(gkeytemp + rand() % 64); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES-GCM-256 random test vectors with random stream of average size %d:", + test_len / 64); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % test_len); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % test_len); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if 
(test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_256_KEY_LEN + offset); + test.Klen = GCM_256_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_256_strm_vector(gkey, gctx, &test, test_len)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkeytemp); + free(gctx); + return 0; +} + +// +// place all data to end at a page boundary to check for read past the end +// +int test_gcm_efence(void) +{ + gcm_vector test; + int offset = 0; + gcm_key_size key_len; + struct gcm_key_data *gkey = NULL; + struct 
gcm_context_data *gctx = NULL; + uint8_t *P = NULL, *C = NULL, *K, *IV, *A, *T; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + ret = posix_memalign((void **)&P, 64, PAGE_LEN); + ret |= posix_memalign((void **)&C, 64, PAGE_LEN); + K = malloc(PAGE_LEN); + IV = malloc(PAGE_LEN); + A = malloc(PAGE_LEN); + T = malloc(PAGE_LEN); + if ((0 != ret) || (NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) + || (NULL == A) || (NULL == T) || (NULL == gkey) || (NULL == gctx)) { + printf("malloc of testsize:0x%x failed\n", PAGE_LEN); + return -1; + } + + test.Plen = PAGE_LEN / 2; + // place buffers to end at page boundary + test.IVlen = GCM_IV_DATA_LEN; + test.Alen = test.Plen; + test.Tlen = MAX_TAG_LEN; + + printf("AES GCM efence test vectors:"); + for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len; + key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) { + test.Klen = key_len; + for (offset = 0; MAX_UNALIGNED > offset; offset++) { + if (0 == (offset % 80)) + printf("\n"); + // move the start and size of the data block towards the end of the page + test.Plen = (PAGE_LEN / 2) - offset; + test.Alen = (PAGE_LEN / 4) - (offset * 4); //lengths must be a multiple of 4 bytes + //Place data at end of page + test.P = P + PAGE_LEN - test.Plen; + test.C = C + PAGE_LEN - test.Plen; + test.K = K + PAGE_LEN - test.Klen; + test.IV = IV + PAGE_LEN - test.IVlen; + test.A = A + PAGE_LEN - test.Alen; + test.T = T + PAGE_LEN - test.Tlen; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + if (GCM_128_KEY_LEN == key_len) { + if (0 != check_vector(gkey, gctx, &test)) + return 1; + } else { + if (0 != check_256_vector(gkey, gctx, &test)) + return 1; + } + } + } + free(gkey); + free(gctx); + free(P); + free(C); + free(K); + free(IV); + free(A); + free(T); + + printf("\n"); + return 0; +}

// Run one 128-bit-key known-answer vector through the ISA-L `_nt` GCM
// encrypt/decrypt entry points and cross-check the results against OpenSSL.
//
// vector: known-answer vector (key, IV, AAD, plaintext, ciphertext, tag).
// Returns 0 when every comparison matches; non-zero on any mismatch, on an
// OpenSSL authentication failure, or when a scratch buffer cannot be allocated.
int test_gcm128_std_vectors(gcm_vector const *vector)
{
	struct gcm_key_data gkey;
	struct gcm_context_data gctx;
	int OK = 1;		// stays "failed" until every scratch buffer exists
	// Temporary arrays for the calculated vectors
	uint8_t *ct_test = NULL;
	uint8_t *pt_test = NULL;
	uint8_t *IV_c = NULL;
	uint8_t *T_test = NULL;
	uint8_t *T2_test = NULL;
	int result;
	int ret;

#ifdef GCM_VECTORS_VERBOSE
	printf("AES-GCM-128:\n");
#endif

	// Allocate space for the calculated ciphertext
	ret = posix_memalign((void **)&ct_test, 64, vector->Plen);
	// Allocate space for the calculated plaintext
	ret |= posix_memalign((void **)&pt_test, 64, vector->Plen);
	if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
		fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
		goto cleanup;
	}
	// Private copy of the IV handed to the ISA-L calls
	IV_c = malloc(vector->IVlen);
	if (IV_c == NULL) {
		fprintf(stderr, "Can't allocate IV memory\n");
		goto cleanup;
	}
	memcpy(IV_c, vector->IV, vector->IVlen);

	T_test = malloc(vector->Tlen);
	T2_test = malloc(vector->Tlen);
	if ((T_test == NULL) || (T2_test == NULL)) {
		fprintf(stderr, "Can't allocate tag memory\n");
		goto cleanup;
	}
	OK = 0;
	// This is only required once for a given key
	aes_gcm_pre_128(vector->K, &gkey);
#ifdef GCM_VECTORS_VERBOSE
	dump_gcm_data(&gkey);
#endif

	////
	// ISA-l Encrypt
	////
	memset(ct_test, 0, vector->Plen);
	memcpy(pt_test, vector->P, vector->Plen);
	aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
			   IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
	OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");

	// OpenSSL emits Tlen tag bytes. They go into the tag-sized T2_test buffer:
	// pt_test only holds Plen bytes and would overflow whenever Plen < Tlen.
	openssl_aes_gcm_enc(vector->K, vector->IV,
			    vector->IVlen, vector->A,
			    vector->Alen, T2_test, vector->Tlen,
			    vector->P, vector->Plen, ct_test);
	OK |= check_data(T2_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)");
	// test of in-place encrypt
	memcpy(pt_test, vector->P, vector->Plen);
	aes_gcm_enc_128_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
			   vector->A, vector->Alen, T_test, vector->Tlen);
	OK |=
	    check_data(pt_test, vector->C, vector->Plen,
		       "ISA-L encrypted cypher text(in-place)");
	memset(ct_test, 0, vector->Plen);
	memset(T_test, 0, vector->Tlen);

	////
	// ISA-l Decrypt
	////
	memcpy(ct_test, vector->C, vector->Plen);
	aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
			   IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
	OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
	// GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");

	// test in in-place decrypt
	memcpy(ct_test, vector->C, vector->Plen);
	aes_gcm_dec_128_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
			   vector->A, vector->Alen, T_test, vector->Tlen);
	OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
	OK |=
	    check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
	// ISA-L enc -> ISA-L dec
	memcpy(pt_test, vector->P, vector->Plen);
	aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
			   IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
	memset(pt_test, 0, vector->Plen);
	aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
			   vector->A, vector->Alen, T2_test, vector->Tlen);
	OK |=
	    check_data(pt_test, vector->P, vector->Plen,
		       "ISA-L self decrypted plain text (P)");
	OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
	// OpenSSl enc -> ISA-L dec
	openssl_aes_gcm_enc(vector->K, vector->IV,
			    vector->IVlen, vector->A,
			    vector->Alen, T_test, vector->Tlen,
			    vector->P, vector->Plen, ct_test);
	OK |=
	    check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");

	memset(pt_test, 0, vector->Plen);
	aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
			   vector->A, vector->Alen, T2_test, vector->Tlen);
	OK |=
	    check_data(pt_test, vector->P, vector->Plen,
		       "OpenSSL->ISA-L decrypted plain text (P)");
	OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
	// ISA-L enc -> OpenSSl dec
	memcpy(pt_test, vector->P, vector->Plen);
	aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
			   IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
	memset(pt_test, 0, vector->Plen);
	result =
	    openssl_aes_gcm_dec(vector->K, vector->IV,
				vector->IVlen, vector->A,
				vector->Alen, T_test, vector->Tlen,
				ct_test, vector->Plen, pt_test);
	if (-1 == result)
		printf(" ISA-L->OpenSSL decryption failed Authentication\n");
	OK |= (-1 == result);
	OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)");

cleanup:
	// Single exit path so early failures release whatever was allocated.
	if (NULL != ct_test)
		aligned_free(ct_test);
	if (NULL != pt_test)
		aligned_free(pt_test);
	if (NULL != IV_c)
		free(IV_c);
	if (NULL != T_test)
		free(T_test);
	if (NULL != T2_test)
		free(T2_test);

	return OK;
}

// 256-bit-key counterpart of test_gcm128_std_vectors(): same sequence of
// checks, using the aes_gcm_*_256_nt and openssl_aes_256_gcm_* entry points.
//
// vector: known-answer vector (key, IV, AAD, plaintext, ciphertext, tag).
// Returns 0 when every comparison matches; non-zero otherwise (including
// allocation failure).
int test_gcm256_std_vectors(gcm_vector const *vector)
{
	struct gcm_key_data gkey;
	struct gcm_context_data gctx;
	int OK = 1;		// stays "failed" until every scratch buffer exists
	// Temporary arrays for the calculated vectors
	uint8_t *ct_test = NULL;
	uint8_t *pt_test = NULL;
	uint8_t *IV_c = NULL;
	uint8_t *T_test = NULL;
	uint8_t *T2_test = NULL;
	int result;
	int ret;

#ifdef GCM_VECTORS_VERBOSE
	printf("AES-GCM-256:\n");
#endif

	// Allocate space for the calculated ciphertext
	ret = posix_memalign((void **)&ct_test, 64, vector->Plen);
	// Allocate space for the calculated plaintext
	ret |= posix_memalign((void **)&pt_test, 64, vector->Plen);
	if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
		fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
		goto cleanup;
	}
	// Private copy of the IV handed to the ISA-L calls
	IV_c = malloc(vector->IVlen);
	if (IV_c == NULL) {
		fprintf(stderr, "Can't allocate IV memory\n");
		goto cleanup;
	}
	memcpy(IV_c, vector->IV, vector->IVlen);

	T_test = malloc(vector->Tlen);
	T2_test = malloc(vector->Tlen);
	// Both tag buffers are written below, so both must be checked.
	if ((T_test == NULL) || (T2_test == NULL)) {
		fprintf(stderr, "Can't allocate tag memory\n");
		goto cleanup;
	}
	OK = 0;
	// This is only required once for a given key
	aes_gcm_pre_256(vector->K, &gkey);
#ifdef GCM_VECTORS_VERBOSE
	dump_gcm_data(&gkey);
#endif

	////
	// ISA-l Encrypt
	////
	memset(ct_test, 0, vector->Plen);
	memcpy(pt_test, vector->P, vector->Plen);
	aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
			   IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
	OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");

	// OpenSSL emits Tlen tag bytes. They go into the tag-sized T2_test buffer:
	// pt_test only holds Plen bytes and would overflow whenever Plen < Tlen.
	openssl_aes_256_gcm_enc(vector->K, vector->IV,
				vector->IVlen, vector->A,
				vector->Alen, T2_test, vector->Tlen,
				vector->P, vector->Plen, ct_test);
	// The ciphertext is Plen bytes long, so compare Plen bytes (not Tlen).
	OK |= check_data(ct_test, vector->C, vector->Plen, "OpenSSL vs KA - cypher text (C)");
	OK |= check_data(T2_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)");
	OK |= check_data(T2_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)");
	// test of in-place encrypt
	memcpy(pt_test, vector->P, vector->Plen);
	aes_gcm_enc_256_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
			   vector->A, vector->Alen, T_test, vector->Tlen);
	OK |=
	    check_data(pt_test, vector->C, vector->Plen,
		       "ISA-L encrypted cypher text(in-place)");
	memset(ct_test, 0, vector->Plen);
	memset(T_test, 0, vector->Tlen);

	////
	// ISA-l Decrypt
	////
	memcpy(ct_test, vector->C, vector->Plen);
	aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
			   IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
	OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
	// GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
	OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");

	// test in in-place decrypt
	memcpy(ct_test, vector->C, vector->Plen);
	aes_gcm_dec_256_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
			   vector->A, vector->Alen, T_test, vector->Tlen);
	OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
	OK |=
	    check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
	// ISA-L enc -> ISA-L dec
	memcpy(pt_test, vector->P, vector->Plen);
	aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
			   IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
	memset(pt_test, 0, vector->Plen);
	aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
			   vector->A, vector->Alen, T2_test, vector->Tlen);
	OK |=
	    check_data(pt_test, vector->P, vector->Plen,
		       "ISA-L self decrypted plain text (P)");
	OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
	// OpenSSl enc -> ISA-L dec
	openssl_aes_256_gcm_enc(vector->K, vector->IV,
				vector->IVlen, vector->A,
				vector->Alen, T_test, vector->Tlen,
				vector->P, vector->Plen, ct_test);
	OK |=
	    check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
	memset(pt_test, 0, vector->Plen);
	aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
			   vector->A, vector->Alen, T2_test, vector->Tlen);
	OK |=
	    check_data(pt_test, vector->P, vector->Plen,
		       "OpenSSL->ISA-L decrypted plain text (P)");
	OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
	// ISA-L enc -> OpenSSl dec
	memcpy(pt_test, vector->P, vector->Plen);
	aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
			   IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
	memset(pt_test, 0, vector->Plen);
	result =
	    openssl_aes_256_gcm_dec(vector->K, vector->IV,
				    vector->IVlen, vector->A,
				    vector->Alen, T_test, vector->Tlen,
				    ct_test, vector->Plen, pt_test);
	if (-1 == result)
		printf(" ISA-L->OpenSSL decryption failed Authentication\n");
	OK |= (-1 == result);
	OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)");

cleanup:
	// Single exit path so early failures release whatever was allocated.
	if (NULL != ct_test)
		aligned_free(ct_test);
	if (NULL != pt_test)
		aligned_free(pt_test);
	if (NULL != IV_c)
		free(IV_c);
	if (NULL != T_test)
		free(T_test);
	if (NULL != T2_test)
		free(T2_test);

	return OK;
}

// Walk the gcm_vectors table, dispatching each entry to the 128- or 256-bit
// checker according to its key length. Stops at the first failing vector.
// Returns 0 when all vectors pass, otherwise the failing checker's result.
int test_gcm_std_vectors(void)
{
	int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
	int vect;
	int OK = 0;

	printf("AES-GCM standard test vectors:\n");
	for (vect = 0; vect < vectors_cnt; vect++) {
#ifdef GCM_VECTORS_VERBOSE
		printf
		    ("Standard vector %d/%d  Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
		     vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
		     (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
		     (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
#else
		printf(".");
#endif

		if (BITS_128 == gcm_vectors[vect].Klen) {
			OK |= test_gcm128_std_vectors(&gcm_vectors[vect]);
		} else {
			OK |= test_gcm256_std_vectors(&gcm_vectors[vect]);
		}
		if (0 != OK)
			return OK;
	}
	printf("\n");
	return OK;
}

// The length of the data is set to length. The first stream is from 0 to start. After
// that the data is broken into breaks chunks of equal size (except possibly the last
// one due to divisibility).
+int test_gcm_strm_combinations2(int length, int start, int breaks) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + int ret; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors of length %d and stream with %d breaks:", length, + breaks + 1); + for (t = 0; RANDOMS > t; t++) { + int Plen = length; + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + ret = posix_memalign((void **)&test.P, 64, test.Plen + offset); + ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + ret = posix_memalign((void **)&test.P, 64, 16); + ret |= posix_memalign((void **)&test.C, 64, 16); + } + if (ret != 0) { + printf("posix_memalign for testsize:0x%x failed\n", Plen); + return 1; + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); 
+ + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector2(gkey, gctx, &test, length, start, breaks)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + aligned_free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + aligned_free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int main(int argc, char **argv) +{ + int errors = 0; + int seed; + + if (argc == 1) + seed = TEST_SEED; + else + seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + errors += test_gcm_std_vectors(); + errors += test_gcm256_combinations(); + errors += test_gcm_combinations(); + errors += test_gcm_efence(); + errors += test_gcm256_strm_combinations(TEST_LEN); + errors += test_gcm_strm_combinations(TEST_LEN); + errors += test_gcm256_strm_combinations(1024); + errors += test_gcm_strm_combinations(1024); + errors += test_gcm_strm_efence(); + errors += test_gcm_strm_combinations2(1024, 0, 1024); + + if (0 == errors) + printf("...Pass\n"); + else + printf("...Fail\n"); + + return errors; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c new file mode 100644 index 000000000..19c0cc447 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c @@ -0,0 +1,322 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> // for memcmp +#include <aes_gcm.h> +#include "gcm_vectors.h" +#include "types.h" + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + + mismatch = memcmp(test, expected, len); + if (mismatch) { + OK = 1; + printf(" expected results don't match %s \t\t", data_name); + { + uint64_t a; + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + } + return OK; +} + +int test_gcm128_std_vectors_nt(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + int ret; + + // Allocate space for the calculated ciphertext + ret = posix_memalign((void **)&ct_test, 32, vector->Plen); + // Allocate space for the calculated plaintext + ret |= posix_memalign((void **)&pt_test, 32, vector->Plen); + if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if ((T_test == NULL) || (T2_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, &gkey); + + //// 
+ // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, 
vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + memset(pt_test, 0, vector->Plen); + + if (NULL != ct_test) + aligned_free(ct_test); + if (NULL != pt_test) + aligned_free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm256_std_vectors_nt(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + int ret; + + // Allocate space for the calculated ciphertext + ret = posix_memalign((void **)&ct_test, 32, vector->Plen); + // Allocate space for the calculated plaintext + ret |= posix_memalign((void **)&pt_test, 32, vector->Plen); + if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if (T_test == NULL) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, 
vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + memset(pt_test, 0, vector->Plen); + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + if (NULL != ct_test) + aligned_free(ct_test); + if (NULL != pt_test) + aligned_free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + 
free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm_std_vectors_nt(void) +{ + int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); + int vect; + int OK = 0; + + printf("AES-GCM standard test vectors NT:\n"); + for (vect = 0; (vect < vectors_cnt); vect++) { +#ifdef DEBUG + printf("Standard vector NT %d/%d" + " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen, + (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen, + (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen); +#else + printf("."); +#endif + if (BITS_128 == gcm_vectors[vect].Klen) + OK |= test_gcm128_std_vectors_nt(&gcm_vectors[vect]); + else + OK |= test_gcm256_std_vectors_nt(&gcm_vectors[vect]); + if (0 != OK) + return OK; + } + printf("\n"); + return OK; +} + +int main(int argc, char **argv) +{ + int errors = 0; + int seed; + + if (argc == 1) + seed = TEST_SEED; + else + seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + errors += test_gcm_std_vectors_nt(); + + if (0 == errors) + printf("...Pass\n"); + else + printf("...Fail\n"); + + return errors; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c new file mode 100644 index 000000000..a9e9c5914 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c @@ -0,0 +1,272 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/

#include <stdio.h>
#include <stdlib.h>		// for rand
#include <string.h>		// for memcmp
#include <aes_gcm.h>
#include <test.h>
#include "ossl_helper.h"
#include "gcm_vectors.h"

// Arithmetic macros are parenthesized so they expand safely inside larger
// expressions such as casts and multiplications.
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_LEN     (8*1024)
# define TEST_LOOPS   400000
# define TEST_TYPE_STR "_warm"
#else
// Uncached test.  Pull from large mem base.
# define GT_L3_CACHE  (32*1024*1024)	/* some number > last level cache */
# define TEST_LEN     (2 * GT_L3_CACHE)
# define TEST_LOOPS   50
# define TEST_TYPE_STR "_cold"
#endif

#define AAD_LENGTH 16
#define TEST_MEM   TEST_LEN

static unsigned char *plaintext, *gcm_plaintext, *cyphertext, *ossl_plaintext,
    *ossl_cyphertext, *gcm_tag, *ossl_tag, *IV, *AAD;
static uint8_t key128[GCM_128_KEY_LEN];
static uint8_t key256[GCM_256_KEY_LEN];
uint8_t iv_len = 0;

// Number of warm-up ISA-L vs OpenSSL comparisons that did not match.
// Set by aes_gcm_perf(), turned into the process exit status by main().
static int warmup_mismatches = 0;

// Fill size bytes at data with values taken from rand().
void mk_rand_data(uint8_t * data, uint32_t size)
{
	unsigned int i;
	for (i = 0; i < size; i++) {
		*data++ = rand();
	}
}

// Compare len bytes of test against expected.
// NOTE: in this file the result is 1 on a MATCH and 0 on a mismatch.
// On a mismatch, prints vect, data_name and the first differing byte pair.
int check_data(uint8_t * test, uint8_t * expected, uint64_t len, int vect, char *data_name)
{
	int mismatch;
	int OK = 1;

	mismatch = memcmp(test, expected, len);
	if (mismatch) {
		OK = 0;
		printf("  v[%d] expected results don't match %s \t\t", vect, data_name);
		{
			uint64_t a;
			for (a = 0; a < len; a++) {
				if (test[a] != expected[a]) {
					// uint64_t is not "unsigned long" on every ABI;
					// print through unsigned long long instead
					printf(" '%x' != '%x' at %llx of %llx\n",
					       test[a], expected[a],
					       (unsigned long long)a,
					       (unsigned long long)len);
					break;
				}
			}
		}
	}
	return OK;
}

// Time ISA-L and OpenSSL AES-GCM encrypt/decrypt for 128- and 256-bit keys
// over the file-level buffers, printing one perf line per variant.
// A warm-up pass first cross-checks ISA-L output against OpenSSL; mismatches
// are counted in warmup_mismatches.
void aes_gcm_perf(void)
{
	struct gcm_key_data gkey, gkey256;
	struct gcm_context_data gctx;
	int i;

	printf
	    ("AES GCM performance parameters plain text length:%d; IV length:%d; ADD length:%d \n",
	     TEST_LEN, GCM_IV_LEN, AAD_LENGTH);

	mk_rand_data(key128, sizeof(key128));
	mk_rand_data(key256, sizeof(key256));

	// This is only required once for a given key
	aes_gcm_pre_128(key128, &gkey);
	aes_gcm_pre_256(key256, &gkey256);

	// Preload code cache; record (rather than discard) the comparison results
	aes_gcm_enc_128(&gkey, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
			gcm_tag, MAX_TAG_LEN);
	openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
			    plaintext, TEST_LEN, ossl_cyphertext);
	warmup_mismatches += !check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
					 "ISA-L vs OpenSSL 128 key cypher text (C)");
	warmup_mismatches +=
	    !check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 128 tag (T)");
	aes_gcm_enc_256(&gkey256, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
			gcm_tag, MAX_TAG_LEN);
	openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
				plaintext, TEST_LEN, ossl_cyphertext);
	warmup_mismatches += !check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
					 "ISA-L vs OpenSSL 256 cypher text (C)");
	warmup_mismatches +=
	    !check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 256 tag (T)");

	{
		struct perf start, stop;

		perf_start(&start);
		for (i = 0; i < TEST_LOOPS; i++) {
			aes_gcm_enc_128(&gkey, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD,
					AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
		}

		perf_stop(&stop);
		printf("        aes_gcm_enc" TEST_TYPE_STR ":\t");
		perf_print(stop, start, (long long)TEST_LEN * i);
	}
	{
		struct perf start, stop;

		perf_start(&start);
		for (i = 0; i < TEST_LOOPS; i++) {
			openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH,
					    ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
					    cyphertext);
		}

		perf_stop(&stop);
		printf("openssl_aes_gcm_enc" TEST_TYPE_STR ":\t");
		perf_print(stop, start, (long long)TEST_LEN * i);
	}
	{
		struct perf start, stop;

		perf_start(&start);
		for (i = 0; i < TEST_LOOPS; i++) {
			aes_gcm_dec_128(&gkey, &gctx, plaintext, cyphertext, TEST_LEN, IV,
					AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
			// NOTE(review): compares gcm_tag with itself, so it can never
			// fail; kept inside the timed loop so the measured work is
			// unchanged. A real check needs a saved copy of the expected tag.
			check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0, "ISA-L check of tag (T)");
		}

		perf_stop(&stop);
		printf("        aes_gcm_dec" TEST_TYPE_STR ":\t");
		perf_print(stop, start, (long long)TEST_LEN * i);
	}
	{
		struct perf start, stop;

		perf_start(&start);
		for (i = 0; i < TEST_LOOPS; i++) {
			openssl_aes_gcm_dec(key128, IV, iv_len, AAD, AAD_LENGTH,
					    ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
					    plaintext);
		}

		perf_stop(&stop);
		printf("openssl_aes_gcm_dec" TEST_TYPE_STR ":\t");
		perf_print(stop, start, (long long)TEST_LEN * i);
	}

	printf("\n");
	{
		struct perf start, stop;

		perf_start(&start);
		for (i = 0; i < TEST_LOOPS; i++) {
			aes_gcm_enc_256(&gkey256, &gctx, cyphertext, plaintext, TEST_LEN, IV,
					AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
		}

		perf_stop(&stop);
		printf("         aes_gcm256_enc" TEST_TYPE_STR ":\t");
		perf_print(stop, start, (long long)TEST_LEN * i);
	}

	{
		struct perf start, stop;

		perf_start(&start);
		for (i = 0; i < TEST_LOOPS; i++) {
			openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH,
						ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
						cyphertext);
		}

		perf_stop(&stop);
		printf("openssl_aes_256_gcm_enc" TEST_TYPE_STR ":\t");
		perf_print(stop, start, (long long)TEST_LEN * i);
	}

	{
		struct perf start, stop;

		perf_start(&start);
		for (i = 0; i < TEST_LOOPS; i++) {
			aes_gcm_dec_256(&gkey256, &gctx, plaintext, cyphertext, TEST_LEN, IV,
					AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
			// NOTE(review): self-comparison, see the 128-bit decrypt loop.
			check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0,
				   "ISA-L check of 256 tag (T)");
		}

		perf_stop(&stop);
		printf("         aes_gcm256_dec" TEST_TYPE_STR ":\t");
		perf_print(stop, start, (long long)TEST_LEN * i);
	}
	{
		struct perf start, stop;

		perf_start(&start);
		for (i = 0; i < TEST_LOOPS; i++) {
			openssl_aes_256_gcm_dec(key256, IV, iv_len, AAD, AAD_LENGTH,
						ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
						plaintext);
		}

		perf_stop(&stop);
		printf("openssl_aes_256_gcm_dec" TEST_TYPE_STR ":\t");
		perf_print(stop, start, (long long)TEST_LEN * i);
	}
}

// Allocate and randomize the benchmark buffers, run aes_gcm_perf(), then
// release the buffers.
// Returns -1 when an allocation fails, 1 when a warm-up ISA-L vs OpenSSL
// comparison mismatched, and 0 otherwise.
int main(void)
{
	uint8_t const IVend[] = GCM_IV_END_MARK;

	plaintext = malloc(TEST_LEN);
	gcm_plaintext = malloc(TEST_LEN);
	cyphertext = malloc(TEST_LEN);
	ossl_plaintext = malloc(TEST_LEN + 16);
	ossl_cyphertext = malloc(TEST_LEN);
	gcm_tag = malloc(MAX_TAG_LEN);
	ossl_tag = malloc(MAX_TAG_LEN);
	AAD = malloc(AAD_LENGTH);
	IV = malloc(GCM_IV_LEN);
	if ((NULL == plaintext) || (NULL == cyphertext) || (NULL == gcm_plaintext)
	    || (NULL == ossl_plaintext) || (NULL == ossl_cyphertext)
	    || (NULL == gcm_tag) || (NULL == ossl_tag) || (NULL == AAD) || (NULL == IV)) {
		printf("malloc of testsize:0x%x failed\n", TEST_LEN);
		return -1;
	}

	mk_rand_data(plaintext, TEST_LEN);
	mk_rand_data(AAD, AAD_LENGTH);
	mk_rand_data(IV, GCM_IV_LEN);
	memcpy(&IV[GCM_IV_END_START], IVend, sizeof(IVend));
	iv_len = GCM_IV_LEN - sizeof(IVend);	//end marker not part of IV length

	aes_gcm_perf();
	printf("AES gcm ISA-L vs OpenSSL performance\n");

	free(plaintext);
	free(gcm_plaintext);
	free(cyphertext);
	free(ossl_plaintext);
	free(ossl_cyphertext);
	free(gcm_tag);
	free(ossl_tag);
	free(AAD);
	free(IV);

	return (0 != warmup_mismatches);
}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c new file mode 100644 index 000000000..ee064ef6c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c @@ -0,0 +1,61 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <aes_gcm.h> +#include <aes_keyexp.h> + +void aes_keyexp_128_enc(const void *, uint8_t *); +void aes_gcm_precomp_128(struct gcm_key_data *key_data); +void aes_gcm_precomp_256(struct gcm_key_data *key_data); +/* Prepare key_data for 128-bit GCM: aes_keyexp_128_enc() fills key_data->expanded_keys from key, then aes_gcm_precomp_128() runs on the same key_data. (key is presumably the raw 16-byte AES key - confirm.) */ +void aes_gcm_pre_128(const void *key, struct gcm_key_data *key_data) +{ + aes_keyexp_128_enc(key, key_data->expanded_keys); + aes_gcm_precomp_128(key_data); +} +/* Prepare key_data for 256-bit GCM: aes_keyexp_256() fills key_data->expanded_keys (and a local scratch buffer) from key, then aes_gcm_precomp_256() runs on key_data. */ +void aes_gcm_pre_256(const void *key, struct gcm_key_data *key_data) +{ + uint8_t tmp_exp_key[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; /* third output of aes_keyexp_256(), never read afterwards. NOTE(review): presumably the decryption schedule; it stays on the stack un-wiped - confirm that is acceptable */ + aes_keyexp_256((const uint8_t *)key, (uint8_t *) key_data->expanded_keys, tmp_exp_key); + aes_gcm_precomp_256(key_data); +} + +struct slver { + uint16_t snum; + uint8_t ver; + uint8_t core; +}; + +// Version info +struct slver aes_gcm_pre_128_slver_000002c7; +struct slver aes_gcm_pre_128_slver = { 0x02c7, 0x00, 0x00 }; + +struct slver aes_gcm_pre_256_slver_000002d7; +struct slver aes_gcm_pre_256_slver = { 0x02d7, 0x00, 0x00 }; diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c new file mode 100644 index 000000000..4b7ca9736 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c @@ -0,0 +1,78 @@ +/********************************************************************** + Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include "aes_gcm.h" + +#define TXT_SIZE 8 +#define AAD_SIZE 32 +#define TAG_SIZE 16 /* Valid values are 16, 12, or 8 */ +#define KEY_SIZE GCM_256_KEY_LEN +#define IV_SIZE GCM_IV_DATA_LEN +/* Print msg, then len bytes of buf in hex; a newline follows every 32nd byte and the final byte. */ +void mprint(const char *msg, uint8_t * buf, int len) +{ + int i; + printf("%s", msg); + for (i = 0; i < len;) { + printf(" %2x", 0xff & buf[i++]); + if (i % 32 == 0) + printf("\n"); + } + printf("\n"); +} +/* Round-trip demo: encrypt an all-zero message with an all-zero 256-bit key, IV and AAD, decrypt it again and print texts and tags. Exit status is 0 only if the round trip is consistent, 1 otherwise. */ +int main(void) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + uint8_t ct[TXT_SIZE], pt[TXT_SIZE], pt2[TXT_SIZE]; // Cipher text, plain text, and decrypted text + uint8_t iv[IV_SIZE], aad[AAD_SIZE], key[KEY_SIZE]; // IV, additional authenticated data, and key + uint8_t tag1[TAG_SIZE], tag2[TAG_SIZE]; // Authentication tags for encode and decode + + printf("gcm example:\n"); + memset(key, 0, KEY_SIZE); + memset(pt, 0, TXT_SIZE); + memset(iv, 0, IV_SIZE); + memset(aad, 0, AAD_SIZE); + + aes_gcm_pre_256(key, &gkey); + aes_gcm_enc_256(&gkey, &gctx, ct, pt, TXT_SIZE, iv, aad, AAD_SIZE, tag1, TAG_SIZE); + aes_gcm_dec_256(&gkey, &gctx, pt2, ct, TXT_SIZE, iv, aad, AAD_SIZE, tag2, TAG_SIZE); + + mprint(" input text: ", pt, TXT_SIZE); + mprint(" cipher text: ", ct, TXT_SIZE); + mprint(" decode text: ", pt2, TXT_SIZE); + mprint(" ath tag1 (enc): ", tag1, TAG_SIZE); + mprint(" ath tag2 (dec): ", tag2, TAG_SIZE); + /* Normalize to 0/1: a raw memcmp() result may be negative or wrap when used as an exit status. Also require the decrypted text to match the input. */ + return (memcmp(tag1, tag2, TAG_SIZE) != 0) || (memcmp(pt, pt2, TXT_SIZE) != 0); +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm new file mode 100644 index 000000000..e35860496 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm @@ -0,0 +1,2171 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2017 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. 
August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. +; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 
64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "reg_sizes.asm" +%include "gcm_defines.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_sse.asm!" +%endif +%endif +%endif + +%ifndef FUNCT_EXTENSION +%define FUNCT_EXTENSION +%endif + +%ifdef GCM128_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse %+ FUNCT_EXTENSION +%define NROUNDS 9 +%endif + +%ifdef GCM192_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse %+ FUNCT_EXTENSION +%define NROUNDS 11 +%endif + +%ifdef GCM256_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse %+ FUNCT_EXTENSION +%define NROUNDS 13 +%endif + + +default rel +; need to push 5 registers into stack to maintain +%define STACK_OFFSET 8*5 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod 
(128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. >>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. The product is left in GH; HK is only read; T1-T5 are scratch. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqa %%T1, %%GH + pshufd %%T2, %%GH, 01001110b + pshufd %%T3, %%HK, 01001110b + pxor %%T2, %%GH ; %%T2 = (a1+a0) + pxor %%T3, %%HK ; %%T3 = (b1+b0) + + pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 + pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T2, %%GH + pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%GH, %%T3 + pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK + + + ;first phase of the reduction + movdqa %%T2, %%GH + movdqa %%T3, %%GH + movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed left shift of each dword: << 31 + pslld %%T3, 30 ; packed left shift of each dword: << 30 + pslld %%T4, 25 ; packed left shift of each dword: << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T4 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction 
+ movdqa %%T2,%%GH ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations + movdqa %%T3,%%GH + movdqa %%T4,%%GH + + psrld %%T2,1 ; packed right shift of each dword: >> 1 + psrld %%T3,2 ; packed right shift of each dword: >> 2 + psrld %%T4,7 ; packed right shift of each dword: >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T4 + + pxor %%T2, %%T5 + pxor %%GH, %%T2 + pxor %%GH, %%T1 ; the result is in %%GH + + +%endmacro + +; PRECOMPUTE: starting from the hash key in %%HK (in the HashKey<<1 mod poly form GHASH_MUL expects), store HashKey_k and then HashKey_i / HashKey_i_k for i = 2..8 at [%%GDATA + ...]. %%HK is only read; %%T1-%%T6 are scratch. +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i (the Karatsuba middle-term operand) + movdqa %%T4, %%HK + pshufd %%T1, %%HK, 01001110b + pxor %%T1, %%HK + movdqu [%%GDATA + HashKey_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly + movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly + movdqu [%%GDATA + HashKey_3], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_3_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly + movdqu [%%GDATA + HashKey_4], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly + movdqu [%%GDATA + HashKey_5], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_5_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly + movdqu [%%GDATA + HashKey_6], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_6_k], 
%%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly + movdqu [%%GDATA + HashKey_7], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly + movdqu [%%GDATA + HashKey_8], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_8_k], %%T1 + + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; OUTPUT is all zeros if the data has length 0; bytes of OUTPUT beyond LENGTH are zero. Only [INPUT, INPUT+LENGTH) is read. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; This and the following arguments are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + pxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + pinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist + je %%_done ;flags are still those of the cmp above (pinsrq does not modify them): exactly 8 bytes, nothing left + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time, from the end backwards, while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + pinsrq %%OUTPUT, %%TMP1, 1 + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time, from the end backwards, while data is left + cmp %%COUNTER, 0 + je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + pinsrq %%OUTPUT, %%TMP1, 0 
+%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). XTMP1-XTMP5 and T1-T5 are scratch. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 14 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%HASH_KEY %4 +%define %%XTMP1 %5 ; xmm temp reg 1 +%define %%XTMP2 %6 +%define %%XTMP3 %7 +%define %%XTMP4 %8 +%define %%XTMP5 %9 ; xmm temp reg 5 +%define %%T1 %10 ; temp reg 1 +%define %%T2 %11 +%define %%T3 %12 +%define %%T4 %13 +%define %%T5 %14 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + pxor %%AAD_HASH, %%AAD_HASH + + cmp %%T2, 16 + jl %%_get_small_AAD_block + +%%_get_AAD_loop16: ; one full 16-byte AAD block per iteration; T2 = bytes remaining + + movdqu %%XTMP1, [%%T1] + ;byte-reflect the AAD data + pshufb %%XTMP1, [SHUF_MASK] + pxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + + sub %%T2, 16 + je %%_CALC_AAD_done + + add %%T1, 16 + cmp %%T2, 16 + jge %%_get_AAD_loop16 + +%%_get_small_AAD_block: ; tail of fewer than 16 bytes (also reached when aadLen is 0), zero-padded by READ_SMALL_DATA_INPUT + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + pshufb %%XTMP1, [SHUF_MASK] + pxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. +; Requires the input data be at least 1 byte long. 
+; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), +; and whether encoding or decoding (ENC_DEC). +; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX; DATA_OFFSET is advanced by the number of bytes written +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + XLDR xmm1, [%%PLAIN_CYPH_IN] ;If at least 16 bytes of data, just fill the xmm register. NOTE(review): no %%DATA_OFFSET here, unlike the short path below - presumably the offset is 0 at this point; confirm against callers + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + mov r13, [%%GDATA_CTX + PBlockLen] + +%%_data_read: ;Finished reading in data + + + movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key + movdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + movdqu xmm2, [r12] ; get the appropriate shuffle mask + pshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + movdqa xmm3, xmm1 + pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block (negative if the block is still not full) + jge %%_no_extra_mask_1 ;Determine if the partial block is not being filled and shift mask accordingly + sub r12, r15 
+%%_no_extra_mask_1: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pand xmm3, xmm1 + pshufb xmm3, [SHUF_MASK] + pshufb xmm3, xmm2 + pxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_dec_done: + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block (negative if the block is still not full) + jge %%_no_extra_mask_2 ;Determine if the partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pshufb xmm9, [SHUF_MASK] + pshufb xmm9, xmm2 + pxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done +%%_partial_incomplete_2: + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%%_encode_done: + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + + pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + pshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + movq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov 
[%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + psrldq xmm9, 8 + movq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified +; Updated AAD_HASH is returned in %%T3; on entry the current hash is taken from %%XMM8 + +%macro INITIAL_BLOCKS 24 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %8 +%define %%HASH_KEY %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 + +%assign i (8-%%num_initial_blocks) + movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg + + ; start AES for %%num_initial_blocks blocks + movdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + paddd %%CTR, [ONE] ; INCR Y0 + movdqa reg(i), %%CTR + pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +movdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + pxor reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS ; NROUNDS aesenc rounds on each of the %%num_initial_blocks blocks (NROUNDS is 9/11/13 for GCM128/192/256) +movdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenc 
reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +movdqu %%T_key, [%%GDATA_KEY+16*j] ; final round (aesenclast) with round key NROUNDS+1, i.e. key 10/12/14 for GCM128/192/256 +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + pxor reg(i), %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + movdqa reg(i), %%T1 + %endif + pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations +%assign i (i+1) +%endrep + + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) + +%rep %%num_initial_blocks + pxor reg(j), reg(i) + GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks +%assign i (i+1) +%assign j (j+1) +%endrep + ; %%XMM8 has the current Hash Value + movdqa %%T3, %%XMM8 + + cmp %%LENGTH, 128 + jl %%_initial_blocks_done ; %%LENGTH < 128: skip encrypting the next 8 blocks, no need for precomputed constants + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Generate the next 8 counter blocks in %%XMM1..%%XMM8 and encrypt them (only reached when %%LENGTH >= 128) + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM1, %%CTR + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM2, %%CTR + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM3, %%CTR + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM4, %%CTR + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM5, %%CTR + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM6, %%CTR + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] 
; INCR Y0 + movdqa %%XMM7, %%CTR + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM8, %%CTR + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + movdqu %%T_key, [%%GDATA_KEY+16*0] + pxor %%XMM1, %%T_key + pxor %%XMM2, %%T_key + pxor %%XMM3, %%T_key + pxor %%XMM4, %%T_key + pxor %%XMM5, %%T_key + pxor %%XMM6, %%T_key + pxor %%XMM7, %%T_key + pxor %%XMM8, %%T_key + + +%assign i 1 +%rep NROUNDS ; NROUNDS aesenc rounds (9/11/13 for GCM128/192/256) + movdqu %%T_key, [%%GDATA_KEY+16*i] + aesenc %%XMM1, %%T_key + aesenc %%XMM2, %%T_key + aesenc %%XMM3, %%T_key + aesenc %%XMM4, %%T_key + aesenc %%XMM5, %%T_key + aesenc %%XMM6, %%T_key + aesenc %%XMM7, %%T_key + aesenc %%XMM8, %%T_key +%assign i (i+1) +%endrep + + + movdqu %%T_key, [%%GDATA_KEY+16*i] ; do final key round + aesenclast %%XMM1, %%T_key + aesenclast %%XMM2, %%T_key + aesenclast %%XMM3, %%T_key + aesenclast %%XMM4, %%T_key + aesenclast %%XMM5, %%T_key + aesenclast %%XMM6, %%T_key + aesenclast %%XMM7, %%T_key + aesenclast %%XMM8, %%T_key + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + pxor %%XMM1, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM1, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + pxor %%XMM2, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM2, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + pxor %%XMM3, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM3, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + pxor %%XMM4, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM4, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + pxor %%XMM5, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM5, %%T1 + %endif + + XLDR %%T1, 
[%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + pxor %%XMM6, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM6, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + pxor %%XMM7, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM7, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + pxor %%XMM8, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM8, %%T1 + %endif + + add %%DATA_OFFSET, 128 + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; %%DATA_OFFSET is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 22 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 + + movdqa %%T7, %%XMM1 + movdqu 
[rsp + TMP2], %%XMM2 + movdqu [rsp + TMP3], %%XMM3 + movdqu [rsp + TMP4], %%XMM4 + movdqu [rsp + TMP5], %%XMM5 + movdqu [rsp + TMP6], %%XMM6 + movdqu [rsp + TMP7], %%XMM7 + movdqu [rsp + TMP8], %%XMM8 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + + movdqa %%T4, %%T7 + pshufd %%T6, %%T7, 01001110b + pxor %%T6, %%T7 + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + %else + paddd %%CTR, [ONEf] ; INCR CNT + %endif + movdqu %%T5, [%%GDATA + HashKey_8] + pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_8_k] + pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + movdqa %%XMM1, %%CTR + + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM5, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM8, %%CTR + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + %else + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM5, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM8, 
%%CTR + %endif + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + movdqu %%T1, [%%GDATA + 16*0] + pxor %%XMM1, %%T1 + pxor %%XMM2, %%T1 + pxor %%XMM3, %%T1 + pxor %%XMM4, %%T1 + pxor %%XMM5, %%T1 + pxor %%XMM6, %%T1 + pxor %%XMM7, %%T1 + pxor %%XMM8, %%T1 + + ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP2] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_7] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_7_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*1] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + movdqu %%T1, [%%GDATA + 16*2] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqu %%T1, [rsp + TMP3] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_6] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_6_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*3] + aesenc %%XMM1, %%T1 
+ aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP4] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_5] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_5_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*4] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*5] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP5] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_4] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_4_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + + movdqu %%T1, [%%GDATA + 16*6] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + movdqu %%T1, [rsp + TMP6] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_3] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_3_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; 
accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*7] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP7] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_2] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_2_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*8] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP8] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T7, %%T3 + pxor %%T4, %%T1 + + movdqu %%T1, [%%GDATA + 16*9] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + +%ifdef GCM128_MODE + movdqu %%T5, [%%GDATA + 16*10] +%endif +%ifdef GCM192_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc 
%%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*12] ; finish last key round +%endif +%ifdef GCM256_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*12] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*13] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*14] ; finish last key round +%endif + +%assign i 0 +%assign j 1 +%rep 8 + XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + +%ifidn %%ENC_DEC, DEC + movdqa %%T3, %%T1 +%endif + + pxor %%T1, %%T5 + aesenclast reg(j), %%T1 ; XMM1:XMM8 + XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer + +%ifidn %%ENC_DEC, DEC + movdqa reg(j), %%T3 +%endif +%assign i (i+1) +%assign j (j+1) +%endrep + + + + + pxor %%T2, %%T6 + pxor %%T2, %%T4 + pxor %%T2, %%T7 + + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%T7, %%T3 + pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7 + + + + ;first phase of the reduction + movdqa %%T2, %%T7 + movdqa %%T3, %%T7 + movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three 
shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T1, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T1 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + ;second phase of the reduction + movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T1) for doing three shift operations + movdqa %%T3,%%T7 + movdqa %%T1,%%T7 + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T1,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T1 + + pxor %%T2, %%T5 + pxor %%T7, %%T2 + pxor %%T7, %%T4 ; the result is in %%T4 + + + pxor %%XMM1, %%T7 + +%endmacro + + +; GHASH the last 4 ciphertext blocks. 
; GHASH_LAST_8: folds eight blocks (%%XMM1..%%XMM8) into one GHASH value.
; Each block is carry-less multiplied (Karatsuba: high, low and middle products)
; by its matching key power, loaded from %%GDATA at HashKey_8 (for %%XMM1) down
; to HashKey (for %%XMM8); the partial products are accumulated and then reduced.
; Result: %%T6.  Clobbers %%T1-%%T7 and %%XMM1-%%XMM8.
%macro  GHASH_LAST_8 16
%define %%GDATA %1
%define %%T1    %2
%define %%T2    %3
%define %%T3    %4
%define %%T4    %5
%define %%T5    %6
%define %%T6    %7
%define %%T7    %8
%define %%XMM1  %9
%define %%XMM2  %10
%define %%XMM3  %11
%define %%XMM4  %12
%define %%XMM5  %13
%define %%XMM6  %14
%define %%XMM7  %15
%define %%XMM8  %16

        ; Karatsuba Method (block 1 x HashKey_8); starts the three accumulators
        movdqa  %%T6, %%XMM1
        pshufd  %%T2, %%XMM1, 01001110b         ; swap the two qwords of the block
        pxor    %%T2, %%XMM1                    ; %%T2 = a1+a0 (in both halves)
        movdqu  %%T5, [%%GDATA + HashKey_8]
        pclmulqdq       %%T6, %%T5, 0x11        ; %%T6 = a1*b1

        pclmulqdq       %%XMM1, %%T5, 0x00      ; %%XMM1 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_8_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        movdqa  %%T7, %%XMM1
        movdqa  %%XMM1, %%T2                    ; result in %%T6 (high), %%T7 (low), %%XMM1 (middle)


        ; Karatsuba Method (block 2 x HashKey_7)
        movdqa  %%T1, %%XMM2
        pshufd  %%T2, %%XMM2, 01001110b
        pxor    %%T2, %%XMM2
        movdqu  %%T5, [%%GDATA + HashKey_7]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM2, %%T5, 0x00      ; %%XMM2 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_7_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM2
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1


        ; Karatsuba Method (block 3 x HashKey_6)
        movdqa  %%T1, %%XMM3
        pshufd  %%T2, %%XMM3, 01001110b
        pxor    %%T2, %%XMM3
        movdqu  %%T5, [%%GDATA + HashKey_6]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM3, %%T5, 0x00      ; %%XMM3 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_6_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM3
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method (block 4 x HashKey_5)
        movdqa  %%T1, %%XMM4
        pshufd  %%T2, %%XMM4, 01001110b
        pxor    %%T2, %%XMM4
        movdqu  %%T5, [%%GDATA + HashKey_5]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM4, %%T5, 0x00      ; %%XMM4 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_5_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM4
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method (block 5 x HashKey_4)
        movdqa  %%T1, %%XMM5
        pshufd  %%T2, %%XMM5, 01001110b
        pxor    %%T2, %%XMM5
        movdqu  %%T5, [%%GDATA + HashKey_4]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM5, %%T5, 0x00      ; %%XMM5 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_4_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM5
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method (block 6 x HashKey_3)
        movdqa  %%T1, %%XMM6
        pshufd  %%T2, %%XMM6, 01001110b
        pxor    %%T2, %%XMM6
        movdqu  %%T5, [%%GDATA + HashKey_3]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM6, %%T5, 0x00      ; %%XMM6 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_3_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM6
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1

        ; Karatsuba Method (block 7 x HashKey_2)
        movdqa  %%T1, %%XMM7
        pshufd  %%T2, %%XMM7, 01001110b
        pxor    %%T2, %%XMM7
        movdqu  %%T5, [%%GDATA + HashKey_2]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM7, %%T5, 0x00      ; %%XMM7 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_2_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM7
        pxor    %%XMM1, %%T2                    ; results accumulated in %%T6, %%T7, %%XMM1


        ; Karatsuba Method (block 8 x HashKey)
        movdqa  %%T1, %%XMM8
        pshufd  %%T2, %%XMM8, 01001110b
        pxor    %%T2, %%XMM8
        movdqu  %%T5, [%%GDATA + HashKey]
        pclmulqdq       %%T1, %%T5, 0x11        ; %%T1 = a1*b1

        pclmulqdq       %%XMM8, %%T5, 0x00      ; %%XMM8 = a0*b0
        movdqu  %%T4, [%%GDATA + HashKey_k]
        pclmulqdq       %%T2, %%T4, 0x00        ; %%T2 = (a1+a0)*(b1+b0)

        pxor    %%T6, %%T1
        pxor    %%T7, %%XMM8
        pxor    %%T2, %%XMM1
        pxor    %%T2, %%T6
        pxor    %%T2, %%T7                      ; middle section of the temp results combined as in Karatsuba algorithm


        movdqa  %%T4, %%T2
        pslldq  %%T4, 8                         ; shift-L %%T4 2 DWs
        psrldq  %%T2, 8                         ; shift-R %%T2 2 DWs
        pxor    %%T7, %%T4
        pxor    %%T6, %%T2                      ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications


        ;first phase of the reduction
        movdqa  %%T2, %%T7
        movdqa  %%T3, %%T7
        movdqa  %%T4, %%T7                      ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently

        pslld   %%T2, 31                        ; packed left shift of each dword, << 31
        pslld   %%T3, 30                        ; packed left shift of each dword, << 30
        pslld   %%T4, 25                        ; packed left shift of each dword, << 25
        pxor    %%T2, %%T3                      ; xor the shifted versions
        pxor    %%T2, %%T4

        movdqa  %%T1, %%T2
        psrldq  %%T1, 4                         ; shift-R %%T1 1 DW

        pslldq  %%T2, 12                        ; shift-L %%T2 3 DWs
        pxor    %%T7, %%T2                      ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction
        movdqa  %%T2,%%T7                       ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
        movdqa  %%T3,%%T7
        movdqa  %%T4,%%T7

        psrld   %%T2,1                          ; packed right shift of each dword, >> 1
        psrld   %%T3,2                          ; packed right shift of each dword, >> 2
        psrld   %%T4,7                          ; packed right shift of each dword, >> 7
        pxor    %%T2,%%T3                       ; xor the shifted versions
        pxor    %%T2,%%T4

        pxor    %%T2, %%T1
        pxor    %%T7, %%T2
        pxor    %%T6, %%T7                      ; the result is in %%T6

%endmacro

; Encryption of a single block
; %%GDATA: pointer to the expanded round keys, 16 bytes apart starting at offset 0
; %%ST:    block to encrypt, replaced by the encrypted block
; %%T1:    scratch register used to load each round key
; Key 0 is XORed in, keys 1..NROUNDS go through aesenc, and the key that follows
; (index NROUNDS+1) goes through aesenclast.  Uses the assembler variable i.
%macro  ENCRYPT_SINGLE_BLOCK 3
%define %%GDATA %1
%define %%ST    %2
%define %%T1    %3
        movdqu  %%T1, [%%GDATA+16*0]
        pxor    %%ST, %%T1
%assign i 1
%rep NROUNDS
        movdqu  %%T1, [%%GDATA+16*i]
        aesenc  %%ST, %%T1
%assign i (i+1)
%endrep
        movdqu  %%T1, [%%GDATA+16*i]            ; i == NROUNDS+1 here: last round key
        aesenclast      %%ST, %%T1
%endmacro


;; Start of Stack Setup

; FUNC_SAVE: prologue shared by the update entry points.
; Pushes r12-r15 and rsi, remembers the post-push rsp in r14, then reserves
; VARIABLE_OFFSET bytes and aligns rsp down to 64 bytes.  On win64 it also
; spills xmm6-xmm15 to LOCAL_STORAGE and loads the fifth argument into arg5.
%macro FUNC_SAVE 0
        ;; Required for Update/GCM_ENC
        ;the number of pushes must equal STACK_OFFSET
        push    r12
        push    r13
        push    r14
        push    r15
        push    rsi
        mov     r14, rsp                        ; r14 = rsp before the aligned local area; FUNC_RESTORE reloads rsp from it

        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                        ; align rsp down to a 64-byte boundary

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        movdqu  [rsp + LOCAL_STORAGE + 0*16],xmm6
        movdqu  [rsp + LOCAL_STORAGE + 1*16],xmm7
        movdqu  [rsp + LOCAL_STORAGE + 2*16],xmm8
        movdqu  [rsp + LOCAL_STORAGE + 3*16],xmm9
        movdqu  [rsp + LOCAL_STORAGE + 4*16],xmm10
        movdqu  [rsp + LOCAL_STORAGE + 5*16],xmm11
        movdqu  [rsp + LOCAL_STORAGE + 6*16],xmm12
        movdqu  [rsp + LOCAL_STORAGE + 7*16],xmm13
        movdqu  [rsp + LOCAL_STORAGE + 8*16],xmm14
        movdqu  [rsp + LOCAL_STORAGE + 9*16],xmm15

        mov     arg5, arg(5)                    ;[r14 + STACK_OFFSET + 8*5]
%endif
%endmacro


; FUNC_RESTORE: epilogue matching FUNC_SAVE.
; On win64 reloads xmm6-xmm15 from LOCAL_STORAGE, then restores rsp from r14
; and pops the registers in the reverse order of FUNC_SAVE's pushes.
%macro FUNC_RESTORE 0

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm15 , [rsp + LOCAL_STORAGE + 9*16]
        movdqu  xmm14 , [rsp + LOCAL_STORAGE + 8*16]
        movdqu  xmm13 , [rsp + LOCAL_STORAGE + 7*16]
        movdqu  xmm12 , [rsp + LOCAL_STORAGE + 6*16]
        movdqu  xmm11 , [rsp + LOCAL_STORAGE + 5*16]
        movdqu  xmm10 , [rsp + LOCAL_STORAGE + 4*16]
        movdqu  xmm9 , [rsp + LOCAL_STORAGE + 3*16]
        movdqu  xmm8 , [rsp + LOCAL_STORAGE + 2*16]
        movdqu  xmm7 , [rsp + LOCAL_STORAGE + 1*16]
        movdqu  xmm6 , [rsp + LOCAL_STORAGE + 0*16]
%endif

;; Required for Update/GCM_ENC
        mov     rsp, r14
        pop     rsi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
; Additional Authentication data (A_IN), Additional Data length (A_LEN).
; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA.
; Clobbers rax, r10-r13 and xmm0-xmm6
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Arguments:
;   %1 GDATA_KEY  pointer to the key data (HashKey is read from it)
;   %2 GDATA_CTX  pointer to the context that is initialised
;   %3 IV         pointer to the IV; 12 bytes are read from it
;   %4 A_IN       pointer to the additional authenticated data
;   %5 A_LEN      length of A_IN, stored verbatim in AadLen
; Context fields written: AadHash, AadLen, InLen (0), PBlockLen (0),
; PBlockEncKey (0), OrigIV and CurCount.
%macro  GCM_INIT        5
%define %%GDATA_KEY     %1
%define %%GDATA_CTX     %2
%define %%IV            %3
%define %%A_IN          %4
%define %%A_LEN         %5
%define %%AAD_HASH      xmm0
%define %%SUBHASH       xmm1


        movdqu  %%SUBHASH, [%%GDATA_KEY + HashKey]

        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
        ; xmm2/xmm3 were scratch registers for CALC_AAD_HASH and hold leftovers.
        ; Clear xmm2 against itself so that PBlockEncKey below really is zero
        ; (XORing it with xmm3 stored hash residue in the context instead).
        pxor    xmm2, xmm2
        mov     r10, %%A_LEN

        movdqu  [%%GDATA_CTX + AadHash], %%AAD_HASH      ; ctx_data.aad hash = aad_hash
        mov     [%%GDATA_CTX + AadLen], r10              ; ctx_data.aad_length = aad_length
        xor     r10, r10
        mov     [%%GDATA_CTX + InLen], r10               ; ctx_data.in_length = 0
        mov     [%%GDATA_CTX + PBlockLen], r10           ; ctx_data.partial_block_length = 0
        movdqu  [%%GDATA_CTX + PBlockEncKey], xmm2       ; ctx_data.partial_block_enc_key = 0
        mov     r10, %%IV
        movdqa  xmm2, [rel ONEf]                         ; read 12 IV bytes and pad with 0x00000001
        pinsrq  xmm2, [r10], 0                           ; IV bytes 0..7  -> low qword
        pinsrd  xmm2, [r10+8], 2                         ; IV bytes 8..11 -> dword 2; dword 3 keeps the ONEf padding
        movdqu  [%%GDATA_CTX + OrigIV], xmm2             ; ctx_data.orig_IV = iv

        pshufb  xmm2, [SHUF_MASK]                        ; byte-swap for use as the running counter

        movdqu  [%%GDATA_CTX + CurCount], xmm2           ; ctx_data.current_counter = iv
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
; struct has been initialized by GCM_INIT.
; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC)
; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10-r15, and xmm0-xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_ENC_DEC             6
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%CYPH_PLAIN_OUT        %3
%define %%PLAIN_CYPH_IN         %4
%define %%PLAIN_CYPH_LEN        %5
%define %%ENC_DEC               %6
%define %%DATA_OFFSET           r11

; Macro flow:
; calculate the number of 16byte blocks in the message
; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'

        cmp     %%PLAIN_CYPH_LEN, 0
        je      %%_multiple_of_16_bytes                 ; nothing to do for an empty input

        xor     %%DATA_OFFSET, %%DATA_OFFSET
        add     [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
        movdqu  xmm13, [%%GDATA_KEY + HashKey]          ; xmm13 = HashKey
        movdqu  xmm8, [%%GDATA_CTX + AadHash]


        PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC

        mov     r13, %%PLAIN_CYPH_LEN                   ; save the number of bytes of plaintext/ciphertext
        sub     r13, %%DATA_OFFSET
        mov     r10, r13                                ;save the amount of data left to process in r10
        and     r13, -16                                ; r13 = r13 - (r13 mod 16)

        ; r12 = (number of whole 16-byte blocks) mod 8: dispatch on it
        mov     r12, r13
        shr     r12, 4
        and     r12, 7
        jz      %%_initial_num_blocks_is_0

        cmp     r12, 7
        je      %%_initial_num_blocks_is_7
        cmp     r12, 6
        je      %%_initial_num_blocks_is_6
        cmp     r12, 5
        je      %%_initial_num_blocks_is_5
        cmp     r12, 4
        je      %%_initial_num_blocks_is_4
        cmp     r12, 3
        je      %%_initial_num_blocks_is_3
        cmp     r12, 2
        je      %%_initial_num_blocks_is_2

        jmp     %%_initial_num_blocks_is_1

%%_initial_num_blocks_is_7:
        INITIAL_BLOCKS  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*7
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_6:
        INITIAL_BLOCKS  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*6
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_5:
        INITIAL_BLOCKS  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*5
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_4:
        INITIAL_BLOCKS  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*4
        jmp     %%_initial_blocks_encrypted


%%_initial_num_blocks_is_3:
        INITIAL_BLOCKS  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*3
        jmp     %%_initial_blocks_encrypted
%%_initial_num_blocks_is_2:
        INITIAL_BLOCKS  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16*2
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_1:
        INITIAL_BLOCKS  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        sub     r13, 16
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_0:
        INITIAL_BLOCKS  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC


%%_initial_blocks_encrypted:
        cmp     r13, 0
        je      %%_zero_cipher_left

        sub     r13, 128
        je      %%_eight_cipher_left




        ; r15d = low byte of the counter held in xmm9; it is tracked separately
        ; so the loop can tell when adding 8 would carry out of that byte
        movd    r15d, xmm9
        and     r15d, 255
        pshufb  xmm9, [SHUF_MASK]


%%_encrypt_by_8_new:
        cmp     r15d, 255-8
        jg      %%_encrypt_by_8                         ; low counter byte too close to 255: use the in_order variant



        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL      %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        jne     %%_encrypt_by_8_new

        pshufb  xmm9, [SHUF_MASK]
        jmp     %%_eight_cipher_left

%%_encrypt_by_8:
        pshufb  xmm9, [SHUF_MASK]
        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL      %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
        pshufb  xmm9, [SHUF_MASK]
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        jne     %%_encrypt_by_8_new

        pshufb  xmm9, [SHUF_MASK]




%%_eight_cipher_left:
        ; fold the eight blocks still held in xmm1-xmm8; the result is left in xmm14
        GHASH_LAST_8    %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8


%%_zero_cipher_left:
        movdqu  [%%GDATA_CTX + AadHash], xmm14
        movdqu  [%%GDATA_CTX + CurCount], xmm9

        mov     r13, r10
        and     r13, 15                                 ; r13 = (%%PLAIN_CYPH_LEN mod 16)

        je      %%_multiple_of_16_bytes

        mov     [%%GDATA_CTX + PBlockLen], r13          ; my_ctx.data.partial_blck_length = r13
        ; handle the last <16 Byte block separately

        paddd   xmm9, [ONE]                             ; INCR CNT to get Yn
        movdqu  [%%GDATA_CTX + CurCount], xmm9          ; my_ctx.data.current_counter = xmm9
        pshufb  xmm9, [SHUF_MASK]
        ENCRYPT_SINGLE_BLOCK    %%GDATA_KEY, xmm9, xmm2 ; E(K, Yn)
        movdqu  [%%GDATA_CTX + PBlockEncKey], xmm9      ; my_ctx_data.partial_block_enc_key = xmm9

        cmp     %%PLAIN_CYPH_LEN, 16
        jge     %%_large_enough_update

        ; fewer than 16 bytes in total: gather the input through READ_SMALL_DATA_INPUT
        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
        READ_SMALL_DATA_INPUT   xmm1, r10, r13, r12, r15, rax
        lea     r12, [SHIFT_MASK + 16]
        sub     r12, r13
        jmp     %%_data_read

%%_large_enough_update:
        ; at least 16 bytes overall: load the 16 bytes that end at the end of the
        ; input, then shift the wanted r13 bytes down to the bottom of xmm1
        sub     %%DATA_OFFSET, 16
        add     %%DATA_OFFSET, r13

        movdqu  xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]   ; receive the last <16 Byte block

        sub     %%DATA_OFFSET, r13
        add     %%DATA_OFFSET, 16

        lea     r12, [SHIFT_MASK + 16]
        sub     r12, r13                                ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
        movdqu  xmm2, [r12]                             ; get the appropriate shuffle mask
        pshufb  xmm1, xmm2                              ; shift right 16-r13 bytes
%%_data_read:
        %ifidn  %%ENC_DEC, DEC
        movdqa  xmm2, xmm1                              ; keep the ciphertext: it is what gets hashed when decrypting
        pxor    xmm9, xmm1                              ; Ciphertext XOR E(K, Yn)
        movdqu  xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
        pand    xmm9, xmm1                              ; mask out top 16-r13 bytes of xmm9
        pand    xmm2, xmm1
        pshufb  xmm2, [SHUF_MASK]
        pxor    xmm14, xmm2
        movdqu  [%%GDATA_CTX + AadHash], xmm14

        %else
        pxor    xmm9, xmm1                              ; Plaintext XOR E(K, Yn)
        movdqu  xmm1, [r12 + ALL_F - SHIFT_MASK]        ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
        pand    xmm9, xmm1                              ; mask out top 16-r13 bytes of xmm9
        pshufb  xmm9, [SHUF_MASK]
        pxor    xmm14, xmm9
        movdqu  [%%GDATA_CTX + AadHash], xmm14

        pshufb  xmm9, [SHUF_MASK]                       ; shuffle xmm9 back to output as ciphertext
        %endif


        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; output r13 Bytes
        movq    rax, xmm9
        cmp     r13, 8
        jle     %%_less_than_8_bytes_left

        ; more than 8 bytes: store the low qword at once, continue with the high one
        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
        add     %%DATA_OFFSET, 8
        psrldq  xmm9, 8
        movq    rax, xmm9
        sub     r13, 8

%%_less_than_8_bytes_left:
        ; store the remaining r13 bytes one at a time, lowest byte of rax first
        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
        add     %%DATA_OFFSET, 1
        shr     rax, 8
        sub     r13, 1
        jne     %%_less_than_8_bytes_left
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%%_multiple_of_16_bytes:

%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and
; whether encoding or decoding (ENC_DEC; not referenced by the macro body).
; Output: Authentication Tag written to AUTH_TAG; AUTH_TAG_LEN selects how many
; bytes are stored: 16, 12, or 8 for any other value.
; Clobbers rax, r10-r12, and xmm0, xmm1, xmm2, xmm5, xmm6, xmm9, xmm10, xmm11,
; xmm13, xmm14, xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_COMPLETE            5
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%AUTH_TAG              %3
%define %%AUTH_TAG_LEN          %4
%define %%ENC_DEC               %5
%define %%PLAIN_CYPH_LEN        rax

        mov     r12, [%%GDATA_CTX + PBlockLen]          ; r12 = length of the pending partial block (bytes)
        movdqu  xmm14, [%%GDATA_CTX + AadHash]
        movdqu  xmm13, [%%GDATA_KEY + HashKey]

        cmp     r12, 0

        je      %%_partial_done

        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
        movdqu  [%%GDATA_CTX + AadHash], xmm14

%%_partial_done:

        mov     r12, [%%GDATA_CTX + AadLen]             ; r12 = aadLen (number of bytes)
        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]

        shl     r12, 3                                  ; convert into number of bits
        ; Move all 64 bits of len(A): the length block carries a 64-bit field for
        ; it, and a 32-bit movd would silently drop the upper half for large AAD.
        movq    xmm15, r12                              ; len(A) in xmm15

        shl     %%PLAIN_CYPH_LEN, 3                     ; len(C) in bits (bytes * 8)
        movq    xmm1, %%PLAIN_CYPH_LEN
        pslldq  xmm15, 8                                ; xmm15 = len(A)|| 0x0000000000000000
        pxor    xmm15, xmm1                             ; xmm15 = len(A)||len(C)

        pxor    xmm14, xmm15
        GHASH_MUL       xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6    ; final GHASH computation
        pshufb  xmm14, [SHUF_MASK]                      ; perform a 16Byte swap

        movdqu  xmm9, [%%GDATA_CTX + OrigIV]            ; xmm9 = Y0

        ENCRYPT_SINGLE_BLOCK    %%GDATA_KEY, xmm9, xmm2 ; E(K, Y0)

        pxor    xmm9, xmm14                             ; tag = E(K, Y0) XOR GHASH



%%_return_T:
        mov     r10, %%AUTH_TAG                         ; r10 = authTag
        mov     r11, %%AUTH_TAG_LEN                     ; r11 = auth_tag_len

        cmp     r11, 16
        je      %%_T_16

        cmp     r11, 12
        je      %%_T_12

%%_T_8:
        ; reached for every length other than 16 and 12: store the low 8 bytes
        movq    rax, xmm9
        mov     [r10], rax
        jmp     %%_return_T_done
%%_T_12:
        movq    rax, xmm9
        mov     [r10], rax
        psrldq  xmm9, 8
        movd    eax, xmm9
        mov     [r10 + 8], eax
        jmp     %%_return_T_done

%%_T_16:
        movdqu  [r10], xmm9

%%_return_T_done:
%endmacro ;GCM_COMPLETE


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse
;       (struct gcm_key_data *key_data);
;
; Encrypts an all-zero block with the expanded key at arg1, turns the result
; into HashKey<<1 mod poly, stores it at [arg1 + HashKey] and then runs
; PRECOMPUTE on it.  Not assembled when FUNCT_EXTENSION is _nt.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifnidn FUNCT_EXTENSION, _nt
global FN_NAME(precomp,_)
FN_NAME(precomp,_):
        endbranch

        push    r12
        push    r13
        push    r14
        push    r15

        mov     r14, rsp                        ; remember rsp so it can be restored after the alignment below



        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                        ; align rsp to 64 bytes

%ifidn __OUTPUT_FORMAT__, win64
        ; only xmm6 needs to be maintained
        movdqu  [rsp + LOCAL_STORAGE + 0*16],xmm6
%endif

        pxor    xmm6, xmm6
        ENCRYPT_SINGLE_BLOCK    arg1, xmm6, xmm2        ; xmm6 = HashKey

        pshufb  xmm6, [SHUF_MASK]
        ;;;;;;;;;;;;;;;  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
        movdqa  xmm2, xmm6
        psllq   xmm6, 1                         ; shift each qword left by one bit
        psrlq   xmm2, 63                        ; xmm2 = the bit shifted out of each qword
        movdqa  xmm1, xmm2
        pslldq  xmm2, 8                         ; carry of the low qword moves into the high qword
        psrldq  xmm1, 8                         ; xmm1 = bit shifted out of the whole 128-bit value
        por     xmm6, xmm2
        ;reduction
        pshufd  xmm2, xmm1, 00100100b
        pcmpeqd xmm2, [TWOONE]
        pand    xmm2, [POLY]
        pxor    xmm6, xmm2                      ; xmm6 holds the HashKey<<1 mod poly
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        movdqu  [arg1 + HashKey], xmm6          ; store HashKey<<1 mod poly


        PRECOMPUTE  arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm6, [rsp + LOCAL_STORAGE + 0*16]
%endif
        mov     rsp, r14

        pop     r15
        pop     r14
        pop     r13
        pop     r12
ret
%endif  ; _nt


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8      *iv,
;        const   u8 *aad,
;        u64     aad_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Thin wrapper around GCM_INIT. Emitted only when FUNCT_EXTENSION is not _nt.
%ifnidn FUNCT_EXTENSION, _nt
global FN_NAME(init,_)
FN_NAME(init,_):
        endbranch

        push    r12
        push    r13
%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        ; NOTE(review): only xmm6 is saved here - confirm GCM_INIT touches no
        ; other register in the xmm6-xmm15 range.
        push    arg5                            ; arg5 is reloaded below, so keep its old value
        sub     rsp, 1*16
        movdqu  [rsp + 0*16],xmm6
        ; Fifth argument lives on the caller's stack: skip the xmm6 slot (1*16),
        ; the three pushes above (8*3), then the return address plus the
        ; 32-byte shadow space (8*5).
        mov     arg5, [rsp + 1*16 + 8*3 + 8*5]
%endif

        GCM_INIT arg1, arg2, arg3, arg4, arg5

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm6 , [rsp + 0*16]
        add     rsp, 1*16
        pop     arg5
%endif
        pop     r13
        pop     r12
        ret
%endif  ; _nt


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8       *out,
;        const u8 *in,
;        u64      plaintext_len);
;
; Streaming encrypt step: FUNC_SAVE, GCM_ENC_DEC in ENC mode, FUNC_RESTORE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
global FN_NAME(enc,_update_)
FN_NAME(enc,_update_):
        endbranch

        FUNC_SAVE

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC

        FUNC_RESTORE

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8       *out,
;        const u8 *in,
;        u64      plaintext_len);
;
; Streaming decrypt step: FUNC_SAVE, GCM_ENC_DEC in DEC mode, FUNC_RESTORE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
global FN_NAME(dec,_update_)
FN_NAME(dec,_update_):
        endbranch

        FUNC_SAVE

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC

        FUNC_RESTORE

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse
;        const struct gcm_key_data *key_data,
;        struct
;        gcm_context_data *context_data,
;        u8       *auth_tag,
;        u64      auth_tag_len);
;
; Produces the authentication tag for a streamed encryption via GCM_COMPLETE.
; r12 is callee-saved and overwritten by GCM_COMPLETE, hence the push/pop.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifnidn FUNCT_EXTENSION, _nt
global FN_NAME(enc,_finalize_)
FN_NAME(enc,_finalize_):
        endbranch

        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows.
        ; GCM_COMPLETE loads the hash key into xmm13 and passes xmm10 to
        ; GHASH_MUL as scratch, so both are preserved along with
        ; xmm6/xmm9/xmm11/xmm14/xmm15.
        sub     rsp, 7*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm9
        movdqu  [rsp + 2*16], xmm10
        movdqu  [rsp + 3*16], xmm11
        movdqu  [rsp + 4*16], xmm13
        movdqu  [rsp + 5*16], xmm14
        movdqu  [rsp + 6*16], xmm15
%endif
        GCM_COMPLETE    arg1, arg2, arg3, arg4, ENC

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm15, [rsp + 6*16]
        movdqu  xmm14, [rsp + 5*16]
        movdqu  xmm13, [rsp + 4*16]
        movdqu  xmm11, [rsp + 3*16]
        movdqu  xmm10, [rsp + 2*16]
        movdqu  xmm9,  [rsp + 1*16]
        movdqu  xmm6,  [rsp + 0*16]
        add     rsp, 7*16
%endif

        pop     r12
        ret
%endif  ; _nt


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse (
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8       *auth_tag,
;        u64      auth_tag_len);
;
; Produces the authentication tag for a streamed decryption via GCM_COMPLETE.
; Register handling is the same as in the enc finalize entry point.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifnidn FUNCT_EXTENSION, _nt
global FN_NAME(dec,_finalize_)
FN_NAME(dec,_finalize_):
        endbranch

        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows.
        ; xmm10 and xmm13 are included because GCM_COMPLETE uses them too.
        sub     rsp, 7*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm9
        movdqu  [rsp + 2*16], xmm10
        movdqu  [rsp + 3*16], xmm11
        movdqu  [rsp + 4*16], xmm13
        movdqu  [rsp + 5*16], xmm14
        movdqu  [rsp + 6*16], xmm15
%endif
        GCM_COMPLETE    arg1, arg2, arg3, arg4, DEC

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm15, [rsp + 6*16]
        movdqu  xmm14, [rsp + 5*16]
        movdqu  xmm13, [rsp + 4*16]
        movdqu  xmm11, [rsp + 3*16]
        movdqu  xmm10, [rsp + 2*16]
        movdqu  xmm9,  [rsp + 1*16]
        movdqu  xmm6,  [rsp + 0*16]
        add     rsp, 7*16
%endif

        pop     r12
        ret
%endif  ; _nt


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void
;       aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8       *out,
;        const u8 *in,
;        u64      plaintext_len,
;        u8       *iv,
;        const u8 *aad,
;        u64      aad_len,
;        u8       *auth_tag,
;        u64      auth_tag_len);
;
; One-shot encrypt: GCM_INIT (iv, aad, aad_len), GCM_ENC_DEC in ENC mode over
; the payload, then GCM_COMPLETE to emit the tag, all between FUNC_SAVE and
; FUNC_RESTORE.
; NOTE(review): arg6-arg10 are presumably stack-argument aliases that are only
; valid after FUNC_SAVE - confirm against their definitions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
global FN_NAME(enc,_)
FN_NAME(enc,_):
        endbranch

        FUNC_SAVE

        GCM_INIT arg1, arg2, arg6, arg7, arg8

        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, ENC

        GCM_COMPLETE arg1, arg2, arg9, arg10, ENC

        FUNC_RESTORE

        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse
;        const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8       *out,
;        const u8 *in,
;        u64      plaintext_len,
;        u8       *iv,
;        const u8 *aad,
;        u64      aad_len,
;        u8       *auth_tag,
;        u64      auth_tag_len);
;
; One-shot decrypt: same sequence as the enc entry point with GCM_ENC_DEC and
; GCM_COMPLETE in DEC mode. The computed tag is written to auth_tag; no
; comparison against an expected tag happens in this routine.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
global FN_NAME(dec,_)
FN_NAME(dec,_):
        endbranch

        FUNC_SAVE

        GCM_INIT arg1, arg2, arg6, arg7, arg8

        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, DEC

        GCM_COMPLETE arg1, arg2, arg9, arg10, DEC

        FUNC_RESTORE

        ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c new file mode 100644 index 000000000..b0a6221d5 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c @@ -0,0 +1,1940 @@
/**********************************************************************
  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> // for memcmp +#include <aes_gcm.h> +#include <openssl/sha.h> +#include "gcm_vectors.h" +#include "ossl_helper.h" +#include "types.h" + +//#define GCM_VECTORS_VERBOSE +//#define GCM_VECTORS_EXTRA_VERBOSE +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#ifndef RANDOMS +# define RANDOMS 200 +#endif +#ifndef TEST_LEN +# define TEST_LEN 32*1024 +#endif +#ifndef PAGE_LEN +# define PAGE_LEN (4*1024) +#endif + +#if defined(NT_LD) || defined(NT_ST) || defined(NT_LDST) +# define ALIGNMENT_MASK (~15) +# define OFFSET_BASE_VALUE 16 +#ifndef MAX_UNALIGNED +# define MAX_UNALIGNED (1) +#endif +#else +# define ALIGNMENT_MASK (~0) +# define OFFSET_BASE_VALUE 1 +#ifndef MAX_UNALIGNED +# define MAX_UNALIGNED (16) +#endif +#endif + +void dump_table(char *title, uint8_t * table, uint8_t count) +{ + int i; + char const *space = " "; + + printf("%s%s => {\n", space, title); + for (i = 0; i < count; i++) { + if (0 == (i & 15)) + printf("%s%s", space, space); + printf("%2x, ", table[i]); + if (15 == (i & 15)) + printf("\n"); + + } + printf("%s}\n", space); +} + +void dump_gcm_data(struct gcm_key_data *gkey) +{ +#ifdef GCM_VECTORS_EXTRA_VERBOSE + printf("gcm_data {\n"); + dump_table("expanded_keys", gkey->expanded_keys, (16 * 11)); + dump_table("shifted_hkey_1", gkey->shifted_hkey_1, 16); + dump_table("shifted_hkey_2", gkey->shifted_hkey_2, 16); + dump_table("shifted_hkey_3", gkey->shifted_hkey_3, 16); + dump_table("shifted_hkey_4", gkey->shifted_hkey_4, 16); + dump_table("shifted_hkey_5", gkey->shifted_hkey_5, 16); + dump_table("shifted_hkey_6", gkey->shifted_hkey_6, 16); + dump_table("shifted_hkey_7", gkey->shifted_hkey_7, 16); + dump_table("shifted_hkey_8", gkey->shifted_hkey_8, 16); + dump_table("shifted_hkey_1_k", gkey->shifted_hkey_1_k, 16); + dump_table("shifted_hkey_2_k", gkey->shifted_hkey_2_k, 16); + 
dump_table("shifted_hkey_3_k", gkey->shifted_hkey_3_k, 16); + dump_table("shifted_hkey_4_k", gkey->shifted_hkey_4_k, 16); + dump_table("shifted_hkey_5_k", gkey->shifted_hkey_5_k, 16); + dump_table("shifted_hkey_6_k", gkey->shifted_hkey_6_k, 16); + dump_table("shifted_hkey_7_k", gkey->shifted_hkey_7_k, 16); + dump_table("shifted_hkey_8_k", gkey->shifted_hkey_8_k, 16); + printf("}\n"); +#endif //GCM_VECTORS_VERBOSE +} + +void mk_rand_data(uint8_t * data, uint32_t size) +{ + int i; + for (i = 0; i < size; i++) { + *data++ = rand(); + } +} + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + + mismatch = memcmp(test, expected, len); + if (mismatch) { + OK = 1; + printf(" expected results don't match %s \t\t", data_name); + { + uint64_t a; + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + } + return OK; +} + +int check_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if 
(IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_128(gkey, gctx, vector->P, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + 
free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + + return OK; +} + +int check_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int test_len) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break; + int i; + uint8_t *rand_data = NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + + last_break = 0; + i = (rand() % test_len / 32) & ALIGNMENT_MASK; + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->P + last_break, i - last_break); + } + 
aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + i = (rand() % test_len / 32) & ALIGNMENT_MASK; + + } + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + if (gctx->in_length != vector->Plen) + printf("%lu, %lu\n", gctx->in_length, vector->Plen); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = 0; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (rand() % (test_len / 64) == 0) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->C + last_break, i - last_break); + } + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= + 
check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + free(rand_data); + + return OK; +} + +int check_strm_vector2(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int length, int start, int breaks) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break = 0; + int i = length; + uint8_t *rand_data = NULL; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = 
malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->P + last_break, i - last_break); + } + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + last_break = i; + i = i + (length - start) / breaks; + + } + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = length; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->C + last_break, i - 
last_break); + } + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + last_break = i; + i = i + (length - start) / breaks; + + } + + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(rand_data); + + return OK; +} + +int check_strm_vector_efence(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break = 0; + int i = 1; + uint8_t *rand_data = NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + 
o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < vector->Plen) { + if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) { + stream = malloc(PAGE_LEN); + i = i & ALIGNMENT_MASK; + memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break, + i - last_break); + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, + stream + PAGE_LEN - (i - last_break), + i - last_break); + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen); + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, 
vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i = 0; + aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < vector->Plen) { + if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) { + stream = malloc(PAGE_LEN); + i = i & ALIGNMENT_MASK; + memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break, + i - last_break); + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, + stream + PAGE_LEN - (i - last_break), + i - last_break); + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + + } + if (rand() % 1024 != 0) + i++; + + } + aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + free(rand_data); + + return OK; +} + +int check_256_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = 
NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_256(gkey, gctx, vector->C, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_256(gkey, 
gctx, vector->P, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted ISA-L plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_256(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted OpenSSL plain text (P)"); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + + return OK; +} + +int check_256_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, + gcm_vector * vector, int test_len) +{ + uint8_t *pt_test = NULL; + uint8_t *ct_test = NULL; + uint8_t *o_ct_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *o_T_test = NULL; + uint8_t *stream = NULL; + uint64_t IV_alloc_len = 0; + int result; + int OK = 0; + uint32_t last_break; + int i; + uint8_t *rand_data = NULL; + uint64_t length; + + rand_data = malloc(100); + +#ifdef GCM_VECTORS_VERBOSE + printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + (int)vector->Klen, + (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen); +#else + printf("."); +#endif + // Allocate space for the calculated ciphertext + if (vector->Plen != 0) { + pt_test = malloc(vector->Plen); + ct_test = malloc(vector->Plen); + o_ct_test = malloc(vector->Plen); + if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == 
NULL)) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + o_T_test = malloc(vector->Tlen); + if ((T_test == NULL) || (o_T_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen); + + last_break = 0; + i = (rand() % test_len / 32) & ALIGNMENT_MASK; + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->P + last_break, i - last_break); + } + + aes_gcm_enc_256_update(gkey, gctx, vector->C + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + last_break = i; + i += (rand() % test_len / 32) & ALIGNMENT_MASK; + + } + aes_gcm_enc_256_update(gkey, gctx, vector->C + last_break, vector->P + last_break, + vector->Plen - last_break); + if (gctx->in_length != vector->Plen) + printf("%lu, %lu\n", gctx->in_length, vector->Plen); + aes_gcm_enc_256_finalize(gkey, gctx, vector->T, vector->Tlen); + + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, o_T_test, + vector->Tlen, vector->P, vector->Plen, o_ct_test); + OK |= + check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)"); + + memcpy(ct_test, vector->C, vector->Plen); + memcpy(pt_test, vector->P, vector->Plen); + memset(vector->P, 0, 
vector->Plen); + memcpy(T_test, vector->T, vector->Tlen); + memset(vector->T, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + + last_break = 0; + i += (rand() % test_len / 32) & ALIGNMENT_MASK; + aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen); + while (i < (vector->Plen)) { + if (i - last_break != 0) { + stream = malloc(i - last_break); + memcpy(stream, vector->C + last_break, i - last_break); + } + + aes_gcm_dec_256_update(gkey, gctx, vector->P + last_break, stream, + i - last_break); + if (i - last_break != 0) + free(stream); + + if (rand() % 1024 == 0) { + length = rand() % 100; + + mk_rand_data(rand_data, length); + SHA1(rand_data, length, rand_data); + } + + last_break = i; + i += (rand() % test_len / 32) & ALIGNMENT_MASK; + + } + aes_gcm_dec_256_update(gkey, gctx, vector->P + last_break, vector->C + last_break, + vector->Plen - last_break); + aes_gcm_dec_256_finalize(gkey, gctx, vector->T, vector->Tlen); + + OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)"); + OK |= + check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)"); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted ISA-L plain text (P)"); + memset(vector->P, 0, vector->Plen); + aes_gcm_dec_256(gkey, gctx, vector->P, o_ct_test, vector->Plen, + IV_c, vector->A, vector->Alen, vector->T, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L decrypted OpenSSL plain text (P)"); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, vector->Alen, + vector->T, vector->Tlen, vector->C, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + free(T_test); + free(o_T_test); + free(IV_c); + free(pt_test); + free(ct_test); + free(o_ct_test); + + return OK; +} + +int test_gcm_strm_efence(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = 
NULL; + struct gcm_context_data *gctx = NULL; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random efence test vectors with random stream:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector_efence(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + 
free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm_strm_combinations(int test_len) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + uint8_t *gkeytemp = NULL; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + + gkeytemp = malloc(sizeof(struct gcm_key_data) + 16); + gctx = malloc(sizeof(struct gcm_context_data)); + gkey = (struct gcm_key_data *)(gkeytemp + rand() % 16); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors with random stream of average size %d:", + test_len / 64); + for (t = 0; RANDOMS > t; t++) { + int Plen = 0; // (rand() % test_len); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % test_len); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + 
mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector(gkey, gctx, &test, test_len)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkeytemp); + free(gctx); + return 0; +} + +int test_gcm_combinations(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + 
offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_vector(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm256_combinations(void) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES-GCM-256 random test vectors:"); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % TEST_LEN); + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle 
NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_256_KEY_LEN + offset); + test.Klen = GCM_256_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_256_vector(gkey, gctx, &test)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int test_gcm256_strm_combinations(int test_len) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + uint8_t *gkeytemp = NULL; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + + gkeytemp = malloc(sizeof(struct gcm_key_data) + 16); + gctx = malloc(sizeof(struct gcm_context_data)); + gkey = (struct gcm_key_data *)(gkeytemp + rand() % 16); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES-GCM-256 random test vectors with random stream of average size %d:", + test_len / 64); + for (t = 0; RANDOMS > t; t++) { + int Plen = (rand() % test_len); + //lengths must be a multiple of 4 bytes + int aad_len = 
(rand() % test_len); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P = malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_256_KEY_LEN + offset); + test.Klen = GCM_256_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_256_strm_vector(gkey, gctx, &test, test_len)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkeytemp); + free(gctx); + return 0; +} + +// +// place all data to end at a page boundary to check for read past the end +// +int test_gcm_efence(void) +{ + gcm_vector test; + int offset = 0; + 
gcm_key_size key_len; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + uint8_t *P, *C, *K, *IV, *A, *T; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + P = malloc(PAGE_LEN); + C = malloc(PAGE_LEN); + K = malloc(PAGE_LEN); + IV = malloc(PAGE_LEN); + A = malloc(PAGE_LEN); + T = malloc(PAGE_LEN); + if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) || (NULL == A) + || (NULL == T) || (NULL == gkey) || (NULL == gctx)) { + printf("malloc of testsize:0x%x failed\n", PAGE_LEN); + return -1; + } + + test.Plen = PAGE_LEN / 2; + // place buffers to end at page boundary + test.IVlen = GCM_IV_DATA_LEN; + test.Alen = test.Plen; + test.Tlen = MAX_TAG_LEN; + + printf("AES GCM efence test vectors:"); + for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len; + key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) { + test.Klen = key_len; + for (offset = 0; MAX_UNALIGNED > offset; offset++) { + if (0 == (offset % 80)) + printf("\n"); + // move the start and size of the data block towards the end of the page + test.Plen = (PAGE_LEN / 2) - offset; + test.Alen = (PAGE_LEN / 4) - (offset * 4); //lengths must be a multiple of 4 bytes + //Place data at end of page + test.P = P + PAGE_LEN - test.Plen; + test.C = C + PAGE_LEN - test.Plen; + test.K = K + PAGE_LEN - test.Klen; + test.IV = IV + PAGE_LEN - test.IVlen; + test.A = A + PAGE_LEN - test.Alen; + test.T = T + PAGE_LEN - test.Tlen; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + if (GCM_128_KEY_LEN == key_len) { + if (0 != check_vector(gkey, gctx, &test)) + return 1; + } else { + if (0 != check_256_vector(gkey, gctx, &test)) + return 1; + } + } + } + free(gkey); + free(gctx); + free(P); + free(C); + free(K); + free(IV); + free(A); + free(T); + + printf("\n"); + return 0; +} + +int test_gcm128_std_vectors(gcm_vector const *vector) +{ + struct 
gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + +#ifdef GCM_VECTORS_VERBOSE + printf("AES-GCM-128:\n"); +#endif + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + if (ct_test == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + // Allocate space for the calculated ciphertext + pt_test = malloc(vector->Plen); + if (pt_test == NULL) { + fprintf(stderr, "Can't allocate plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if ((T_test == NULL) || (T2_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, &gkey); +#ifdef GCM_VECTORS_VERBOSE + dump_gcm_data(&gkey); +#endif + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, pt_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)"); + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, 
vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + // OpenSSl enc -> ISA-L dec + openssl_aes_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= + check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)"); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "OpenSSL->ISA-L 
decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)"); + // ISA-L enc -> OpenSSl dec + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + result = + openssl_aes_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + ct_test, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)"); + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm256_std_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + int result; + +#ifdef GCM_VECTORS_VERBOSE + printf("AES-GCM-256:\n"); +#endif + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the calculated ciphertext + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the calculated ciphertext + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if (T_test == NULL) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + 
} + // This is only required once for a given key + aes_gcm_pre_256(vector->K, &gkey); +#ifdef GCM_VECTORS_VERBOSE + dump_gcm_data(&gkey); +#endif + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, pt_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= check_data(ct_test, vector->C, vector->Tlen, "OpenSSL vs KA - cypher text (C)"); + OK |= check_data(pt_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)"); + OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)"); + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + 
check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + // OpenSSl enc -> ISA-L dec + openssl_aes_256_gcm_enc(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + vector->P, vector->Plen, ct_test); + OK |= + check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)"); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "OpenSSL->ISA-L decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)"); + // ISA-L enc -> OpenSSl dec + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + result = + openssl_aes_256_gcm_dec(vector->K, vector->IV, + vector->IVlen, vector->A, + vector->Alen, T_test, vector->Tlen, + ct_test, vector->Plen, pt_test); + if (-1 == result) + printf(" ISA-L->OpenSSL decryption failed Authentication\n"); + OK |= (-1 == result); + OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (C)"); + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm_std_vectors(void) +{ + int const 
vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); + int vect; + int OK = 0; + + printf("AES-GCM standard test vectors:\n"); + for (vect = 0; vect < vectors_cnt; vect++) { +#ifdef GCM_VECTORS_VERBOSE + printf + ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen, + (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen, + (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen); +#else + printf("."); +#endif + + if (BITS_128 == gcm_vectors[vect].Klen) { + OK |= test_gcm128_std_vectors(&gcm_vectors[vect]); + } else { + OK |= test_gcm256_std_vectors(&gcm_vectors[vect]); + } + if (0 != OK) + return OK; + } + printf("\n"); + return OK; +} + +// The length of the data is set to length. The first stream is from 0 to start. After +// that the data is broken into breaks chunks of equal size (except possibly the last +// one due to divisibility). +int test_gcm_strm_combinations2(int length, int start, int breaks) +{ + gcm_vector test; + int tag_len = 8; + int t = 0; + struct gcm_key_data *gkey = NULL; + struct gcm_context_data *gctx = NULL; + + gkey = malloc(sizeof(struct gcm_key_data)); + gctx = malloc(sizeof(struct gcm_context_data)); + if (NULL == gkey || NULL == gctx) + return 1; + + printf("AES GCM random test vectors of length %d and stream with %d breaks:", length, + breaks + 1); + for (t = 0; RANDOMS > t; t++) { + int Plen = length; + //lengths must be a multiple of 4 bytes + int aad_len = (rand() % TEST_LEN); + int offset = (rand() % MAX_UNALIGNED); + if (offset == 0 && aad_len == 0) + offset = OFFSET_BASE_VALUE; + + if (0 == (t % 25)) + printf("\n"); + if (0 == (t % 10)) + fflush(0); + test.P = NULL; + test.C = NULL; + test.A = NULL; + test.T = NULL; + test.Plen = Plen; + if (test.Plen + offset != 0) { + test.P = malloc(test.Plen + offset); + test.C = malloc(test.Plen + offset); + } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers + test.P 
= malloc(16); + test.C = malloc(16); + } + test.K = malloc(GCM_128_KEY_LEN + offset); + test.Klen = GCM_128_KEY_LEN; + test.IV = malloc(GCM_IV_DATA_LEN + offset); + test.IVlen = GCM_IV_DATA_LEN; + test.A = malloc(aad_len + offset); + + test.Alen = aad_len; + test.T = malloc(MAX_TAG_LEN + offset); + + if ((NULL == test.P && test.Plen != 0) || (NULL == test.K) + || (NULL == test.IV)) { + printf("malloc of testsize:0x%x failed\n", Plen); + return 1; + } + + test.P += offset; + test.C += offset; + test.K += offset; + test.IV += offset; + test.A += offset; + test.T += offset; + + mk_rand_data(test.P, test.Plen); + mk_rand_data(test.K, test.Klen); + mk_rand_data(test.IV, test.IVlen); + mk_rand_data(test.A, test.Alen); + + // single Key length of 128bits/16bytes supported + // single IV length of 96bits/12bytes supported + // Tag lengths of 8, 12 or 16 + for (tag_len = 8; tag_len <= MAX_TAG_LEN;) { + test.Tlen = tag_len; + if (0 != check_strm_vector2(gkey, gctx, &test, length, start, breaks)) + return 1; + tag_len += 4; //supported lengths are 8, 12 or 16 + } + test.A -= offset; + free(test.A); + test.C -= offset; + free(test.C); + test.IV -= offset; + free(test.IV); + test.K -= offset; + free(test.K); + test.P -= offset; + free(test.P); + test.T -= offset; + free(test.T); + } + printf("\n"); + free(gkey); + free(gctx); + return 0; +} + +int main(int argc, char **argv) +{ + int errors = 0; + int seed; + + if (argc == 1) + seed = TEST_SEED; + else + seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + errors += test_gcm_std_vectors(); + errors += test_gcm256_combinations(); + errors += test_gcm_combinations(); + errors += test_gcm_efence(); + errors += test_gcm256_strm_combinations(TEST_LEN); + errors += test_gcm_strm_combinations(TEST_LEN); + errors += test_gcm256_strm_combinations(1024); + errors += test_gcm_strm_combinations(1024); + errors += test_gcm_strm_efence(); + errors += test_gcm_strm_combinations2(1024, 0, 1024); + + if (0 == errors) + 
printf("...Pass\n"); + else + printf("...Fail\n"); + + return errors; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c new file mode 100644 index 000000000..54581d6b6 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c @@ -0,0 +1,659 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> // for memcmp +#include <aes_gcm.h> +#include "gcm_vectors.h" +#include "types.h" + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif + +int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name) +{ + int mismatch; + int OK = 0; + + mismatch = memcmp(test, expected, len); + if (mismatch) { + OK = 1; + printf(" expected results don't match %s \t\t", data_name); + { + uint64_t a; + for (a = 0; a < len; a++) { + if (test[a] != expected[a]) { + printf(" '%x' != '%x' at %lx of %lx\n", + test[a], expected[a], a, len); + break; + } + } + } + } + return OK; +} + +int test_gcm128_std_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the plain text + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the IV + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate IV memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if ((T_test == NULL) || (T2_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_128(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, 
vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + memset(pt_test, 0, vector->Plen); + + if (NULL != 
ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +int test_gcm256_std_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the plain text + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the IV + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate IV memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if (T_test == NULL) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + OK |= check_data(T_test, vector->T, vector->Tlen, 
"ISA-L encrypted tag T(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +void aes_gcm_stream_enc_128(const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t * out, + uint8_t const *in, + uint64_t len, + uint8_t * iv, + uint8_t const *aad, + uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len) +{ + aes_gcm_init_128(key_data, context, iv, aad, aad_len); + uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; 
//sum(test_sequence) > max_Plen in verctors + uint32_t i; + uint32_t offset = 0, dist; + + for (i = 0; i < sizeof(test_sequence); i++) { + dist = test_sequence[i]; + if (offset + dist > len) + break; + aes_gcm_enc_128_update(key_data, context, out + offset, in + offset, dist); + offset += dist; + } + + aes_gcm_enc_128_update(key_data, context, out + offset, in + offset, len - offset); + aes_gcm_enc_128_finalize(key_data, context, auth_tag, auth_tag_len); +} + +void aes_gcm_stream_dec_128(const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t * out, + uint8_t const *in, + uint64_t len, + uint8_t * iv, + uint8_t const *aad, + uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len) +{ + aes_gcm_init_128(key_data, context, iv, aad, aad_len); + uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors + uint32_t i; + uint32_t offset = 0, dist; + + for (i = 0; i < sizeof(test_sequence); i++) { + dist = test_sequence[i]; + if (offset + dist > len) + break; + aes_gcm_dec_128_update(key_data, context, out + offset, in + offset, dist); + offset += dist; + } + aes_gcm_dec_128_update(key_data, context, out + offset, in + offset, len - offset); + aes_gcm_dec_128_finalize(key_data, context, auth_tag, auth_tag_len); + +} + +#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST) +int test_gcm128_std_stream_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the plain text + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + 
} + IV_alloc_len = vector->IVlen; + // Allocate space for the IV + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate IV memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if ((T_test == NULL) || (T2_test == NULL)) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + memset(gkey.expanded_keys, 0, sizeof(gkey.expanded_keys)); + aes_gcm_pre_128(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + + aes_gcm_stream_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_stream_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_stream_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_stream_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= 
check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_stream_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_stream_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + memset(pt_test, 0, vector->Plen); + + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} + +void aes_gcm_stream_enc_256(const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t * out, + uint8_t const *in, + uint64_t len, + uint8_t * iv, + uint8_t const *aad, + uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len) +{ + aes_gcm_init_256(key_data, context, iv, aad, aad_len); + uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors + uint32_t i; + uint32_t offset = 0, dist; + + for (i = 0; i < sizeof(test_sequence); i++) { + dist = test_sequence[i]; + if (offset + dist > len) + break; + aes_gcm_enc_256_update(key_data, context, out + offset, in + offset, dist); + offset += dist; + } + + aes_gcm_enc_256_update(key_data, context, out + offset, in + offset, len - offset); + aes_gcm_enc_256_finalize(key_data, context, auth_tag, auth_tag_len); + +} + +void aes_gcm_stream_dec_256(const struct gcm_key_data *key_data, + struct gcm_context_data *context, + uint8_t * out, + uint8_t const *in, + uint64_t len, + uint8_t * iv, + uint8_t const *aad, + uint64_t aad_len, uint8_t * auth_tag, 
uint64_t auth_tag_len) +{ + aes_gcm_init_256(key_data, context, iv, aad, aad_len); + uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors + uint32_t i; + uint32_t offset = 0, dist; + + for (i = 0; i < sizeof(test_sequence); i++) { + dist = test_sequence[i]; + if (offset + dist > len) + break; + aes_gcm_dec_256_update(key_data, context, out + offset, in + offset, dist); + offset += dist; + } + + aes_gcm_dec_256_update(key_data, context, out + offset, in + offset, len - offset); + aes_gcm_dec_256_finalize(key_data, context, auth_tag, auth_tag_len); + +} + +int test_gcm256_std_stream_vectors(gcm_vector const *vector) +{ + struct gcm_key_data gkey; + struct gcm_context_data gctx; + int OK = 0; + // Temporary array for the calculated vectors + uint8_t *ct_test = NULL; + uint8_t *pt_test = NULL; + uint8_t *IV_c = NULL; + uint8_t *T_test = NULL; + uint8_t *T2_test = NULL; + uint64_t IV_alloc_len = 0; + + // Allocate space for the calculated ciphertext + ct_test = malloc(vector->Plen); + // Allocate space for the plain text + pt_test = malloc(vector->Plen); + if ((ct_test == NULL) || (pt_test == NULL)) { + fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n"); + return 1; + } + IV_alloc_len = vector->IVlen; + // Allocate space for the IV + IV_c = malloc(IV_alloc_len); + if (IV_c == NULL) { + fprintf(stderr, "Can't allocate IV memory\n"); + return 1; + } + memcpy(IV_c, vector->IV, vector->IVlen); + + T_test = malloc(vector->Tlen); + T2_test = malloc(vector->Tlen); + if (T_test == NULL) { + fprintf(stderr, "Can't allocate tag memory\n"); + return 1; + } + // This is only required once for a given key + aes_gcm_pre_256(vector->K, &gkey); + + //// + // ISA-l Encrypt + //// + memset(ct_test, 0, vector->Plen); + aes_gcm_stream_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text 
(C)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)"); + + // test of in-place encrypt + memcpy(pt_test, vector->P, vector->Plen); + aes_gcm_stream_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= + check_data(pt_test, vector->C, vector->Plen, + "ISA-L encrypted cypher text(in-place)"); + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)"); + memset(ct_test, 0, vector->Plen); + memset(T_test, 0, vector->Tlen); + + //// + // ISA-l Decrypt + //// + aes_gcm_stream_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)"); + // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value + OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)"); + + // test in in-place decrypt + memcpy(ct_test, vector->C, vector->Plen); + aes_gcm_stream_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T_test, vector->Tlen); + OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place"); + OK |= + check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place"); + // ISA-L enc -> ISA-L dec + aes_gcm_stream_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen, + IV_c, vector->A, vector->Alen, T_test, vector->Tlen); + memset(pt_test, 0, vector->Plen); + aes_gcm_stream_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c, + vector->A, vector->Alen, T2_test, vector->Tlen); + OK |= + check_data(pt_test, vector->P, vector->Plen, + "ISA-L self decrypted plain text (P)"); + OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)"); + + if (NULL != ct_test) + free(ct_test); + if (NULL != pt_test) + free(pt_test); + if (NULL != IV_c) + free(IV_c); + if (NULL != T_test) + 
free(T_test); + if (NULL != T2_test) + free(T2_test); + + return OK; +} +#endif + +int test_gcm_std_vectors(void) +{ + int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); + int vect; + int OK = 0; + + printf("AES-GCM standard test vectors new api:\n"); + for (vect = 0; (vect < vectors_cnt); vect++) { +#ifdef DEBUG + printf("Standard vector new api %d/%d" + " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen, + (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen, + (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen); +#else + printf("."); +#endif + if (BITS_128 == gcm_vectors[vect].Klen) + OK |= test_gcm128_std_vectors(&gcm_vectors[vect]); + else + OK |= test_gcm256_std_vectors(&gcm_vectors[vect]); + if (0 != OK) + return OK; + } + printf("\n"); + return OK; +} + +#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST) +/** + * Stream API test with standard vectors + */ +int test_gcm_std_strm_vectors(void) +{ + int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); + int vect; + int OK = 0; + + printf("AES-GCM standard test vectors stream api:\n"); + for (vect = 0; (vect < vectors_cnt); vect++) { +#ifdef DEBUG + printf("Standard vector stream api %d/%d" + " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n", + vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen, + (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen, + (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen); +#else + printf("."); +#endif + if (BITS_128 == gcm_vectors[vect].Klen) + OK |= test_gcm128_std_stream_vectors(&gcm_vectors[vect]); + else + OK |= test_gcm256_std_stream_vectors(&gcm_vectors[vect]); + if (0 != OK) + return OK; + } + printf("\n"); + return OK; +} +#endif +int main(int argc, char **argv) +{ + int errors = 0; + int seed; + + if (argc == 1) + seed = TEST_SEED; + else + seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + errors += test_gcm_std_vectors(); +#if 
!defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST) + errors += test_gcm_std_strm_vectors(); +#endif + + if (0 == errors) + printf("...Pass\n"); + else + printf("...Fail\n"); + + return errors; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm new file mode 100644 index 000000000..dac7c5912 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm @@ -0,0 +1,4296 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; Tomasz Kantecki +; +; +; References: +; This code was derived and highly optimized from the code described in the paper: +; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; The details of the implementation are explained in: +; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. 
;
; TLen:
; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
;
; poly = x^128 + x^127 + x^126 + x^121 + 1
; Throughout the code, one-tab and two-tab indentations are used:
; one tab is for the GHASH part, two tabs are for the AES part.
;

%include "reg_sizes.asm"
%include "clear_regs.asm"
%include "gcm_keys_vaes_avx512.asm"
%include "gcm_defines.asm"
%include "memcpy.asm"
%include "aes_common.asm"

;; Exactly one of GCM128_MODE / GCM192_MODE / GCM256_MODE must be defined by
;; the including file; assembly is aborted otherwise.
;; NOTE(review): the message names gcm_avx512.asm although the diff header
;; names this file gcm_vaes_avx512.asm - confirm which is intended.
%ifndef GCM128_MODE
%ifndef GCM192_MODE
%ifndef GCM256_MODE
%error "No GCM mode selected for gcm_avx512.asm!"
%endif
%endif
%endif

;; FUNCT_EXTENSION is an optional suffix appended to every generated function
;; name; it defaults to empty.
%ifndef FUNCT_EXTENSION
%define FUNCT_EXTENSION
%endif

;; Decide on AES-GCM key size to compile for
;; FN_NAME(x,y) expands to aes_gcm_<x>_<keybits><y>vaes_avx512<FUNCT_EXTENSION>.
;; NROUNDS is 9 / 11 / 13 for the 128 / 192 / 256-bit modes - presumably the
;; number of AES rounds before the final round; confirm against its users.
%ifdef GCM128_MODE
%define NROUNDS 9
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION
%endif

%ifdef GCM192_MODE
%define NROUNDS 11
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION
%endif

%ifdef GCM256_MODE
%define NROUNDS 13
%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION
%endif

;; Everything below is only assembled when the assembler feature level is at
;; least 10; the matching %endif is outside this part of the file.
%if (AS_FEATURE_LEVEL) >= 10

section .text
default rel

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Stack frame definition
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_STORAGE (10*16) ; space for 10 XMM registers
        %define GP_STORAGE ((9*8) + 24) ; space for 9 GP registers + 24 bytes for 64 byte alignment
%else
        %define XMM_STORAGE 0
        %define GP_STORAGE (8*8) ; space for 7 GP registers + 1 for alignment
%endif
%ifdef GCM_BIG_DATA
%define LOCAL_STORAGE (128*16) ; space for up to 128 AES blocks
%else
%define LOCAL_STORAGE (48*16) ; space for up to 48 AES blocks
%endif

;;; sequence is (bottom-up): GP, XMM, local
;;; Each offset is the previous offset plus the previous region's size, so
;;; STACK_FRAME_SIZE is the sum of the three storage sizes.
%define STACK_GP_OFFSET 0
%define STACK_XMM_OFFSET (STACK_GP_OFFSET + GP_STORAGE)
%define STACK_LOCAL_OFFSET (STACK_XMM_OFFSET + XMM_STORAGE)
%define STACK_FRAME_SIZE (STACK_LOCAL_OFFSET + LOCAL_STORAGE)

;; for compatibility with stack argument definitions
;; in gcm_defines.asm
%define STACK_OFFSET 0

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Utility Macros
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; ===========================================================================
;;; ===========================================================================
;;; Horizontal XOR - 4 x 128bits xored together
;;; Folds the upper 256 bits onto the lower 256 bits, then the upper 128 bits
;;; of that result onto the lower 128 bits; the result is in XWORD(%%REG).
%macro VHPXORI4x128 2
%define %%REG %1 ; [in/out] ZMM with 4x128bits to xor; 128bit output
%define %%TMP %2 ; [clobbered] ZMM temporary register
        vextracti64x4 YWORD(%%TMP), %%REG, 1
        vpxorq YWORD(%%REG), YWORD(%%REG), YWORD(%%TMP)
        vextracti32x4 XWORD(%%TMP), YWORD(%%REG), 1
        vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
%endmacro ; VHPXORI4x128

;;; ===========================================================================
;;; ===========================================================================
;;; Horizontal XOR - 2 x 128bits xored together
;;; XORs 128-bit lane 1 into lane 0; the result is in XWORD(%%REG).
%macro VHPXORI2x128 2
%define %%REG %1 ; [in/out] YMM/ZMM with 2x128bits to xor; 128bit output
%define %%TMP %2 ; [clobbered] XMM/YMM/ZMM temporary register
        vextracti32x4 XWORD(%%TMP), %%REG, 1
        vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
%endmacro ; VHPXORI2x128

;;; ===========================================================================
;;; ===========================================================================
;;; schoolbook multiply - 1st step
;;; Multiplies %%HI by a hash key and leaves the three partial products in
;;; %%TH / %%TM / %%TL. With 6 arguments the key is loaded from
;;; [%%KP + HashKey_4]; with 7 arguments %%HKEY is used and %%KP is not read.
%macro VCLMUL_STEP1 6-7
%define %%KP %1 ; [in] key pointer
%define %%HI %2 ; [in] previous blocks 4 to 7
%define %%TMP %3 ; [clobbered] ZMM/YMM/XMM temporary
%define %%TH %4 ; [out] high product
%define %%TM %5 ; [out] medium product
%define %%TL %6 ; [out] low product
%define %%HKEY %7 ; [in/optional] hash key for multiplication

%if %0 == 6
        vmovdqu64 %%TMP, [%%KP + HashKey_4]
%else
        vmovdqa64 %%TMP, %%HKEY
%endif
        vpclmulqdq %%TH, %%HI, %%TMP, 0x11 ; %%TH = a1*b1
        vpclmulqdq %%TL, %%HI, %%TMP, 0x00 ; %%TL = a0*b0
        vpclmulqdq %%TM, %%HI, %%TMP, 0x01 ; %%TM = a1*b0
        vpclmulqdq %%TMP, %%HI, %%TMP, 0x10 ; %%TMP = a0*b1
        vpxorq %%TM, %%TM, %%TMP ; %%TM = a1*b0 xor a0*b1 -> [%%TH : %%TM : %%TL]
%endmacro ; VCLMUL_STEP1

;;; ===========================================================================
;;; ===========================================================================
;;; schoolbook multiply - 2nd step
;;; Multiplies %%LO by a hash key, accumulates the partial products into
;;; %%TH / %%TM / %%TL, folds the medium sum into the high and low halves and
;;; finally xors the 128-bit lanes together (see %%HXOR).
;;; With 9 arguments the key is loaded from [%%KP + HashKey_8]; otherwise
;;; %%HKEY is used and %%KP is not read.
%macro VCLMUL_STEP2 9-11
%define %%KP %1 ; [in] key pointer
%define %%HI %2 ; [out] ghash high 128 bits
%define %%LO %3 ; [in/out] cipher text blocks 0-3 (in); ghash low 128 bits (out)
%define %%TMP0 %4 ; [clobbered] ZMM/YMM/XMM temporary
%define %%TMP1 %5 ; [clobbered] ZMM/YMM/XMM temporary
%define %%TMP2 %6 ; [clobbered] ZMM/YMM/XMM temporary
%define %%TH %7 ; [in/clobbered] high product
%define %%TM %8 ; [in/clobbered] medium product
%define %%TL %9 ; [in/clobbered] low product
%define %%HKEY %10 ; [in/optional] hash key for multiplication
%define %%HXOR %11 ; [in/optional] type of horizontal xor (4 - 4x128; 2 - 2x128; 1 - none)

%if %0 == 9
        vmovdqu64 %%TMP0, [%%KP + HashKey_8]
%else
        vmovdqa64 %%TMP0, %%HKEY
%endif
        vpclmulqdq %%TMP1, %%LO, %%TMP0, 0x10 ; %%TMP1 = a0*b1
        vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x11 ; %%TMP2 = a1*b1
        vpxorq %%TH, %%TH, %%TMP2
        vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x00 ; %%TMP2 = a0*b0
        vpxorq %%TL, %%TL, %%TMP2
        vpclmulqdq %%TMP0, %%LO, %%TMP0, 0x01 ; %%TMP0 = a1*b0
        vpternlogq %%TM, %%TMP1, %%TMP0, 0x96 ; %%TM = TM xor TMP1 xor TMP0

        ;; finish multiplications
        ;; upper 8 bytes of the medium sum go into the high half,
        ;; lower 8 bytes go into the low half
        vpsrldq %%TMP2, %%TM, 8
        vpxorq %%HI, %%TH, %%TMP2
        vpslldq %%TMP2, %%TM, 8
        vpxorq %%LO, %%TL, %%TMP2

        ;; xor 128bit words horizontally and compute [(X8*H1) + (X7*H2) + ... ((X1+Y0)*H8)]
        ;; note: (X1+Y0) handled elsewhere
%if %0 < 11
        ;; %%HXOR not supplied: default to the 4x128 horizontal xor
        VHPXORI4x128 %%HI, %%TMP2
        VHPXORI4x128 %%LO, %%TMP1
%else
%if %%HXOR == 4
        VHPXORI4x128 %%HI, %%TMP2
        VHPXORI4x128 %%LO, %%TMP1
%elif %%HXOR == 2
        VHPXORI2x128 %%HI, %%TMP2
        VHPXORI2x128 %%LO, %%TMP1
%endif ; HXOR
        ;; for HXOR == 1 there is nothing to be done
%endif ; !(%0 < 11)
        ;; HIx holds top 128 bits
        ;; LOx holds low 128 bits
        ;; - further reductions to follow
%endmacro ; VCLMUL_STEP2

;;; ===========================================================================
;;; ===========================================================================
;;; AVX512 reduction macro
;;; Reduces the 256-bit value [%%HI128 : %%LO128] using %%POLY and writes the
;;; result to %%OUT. %%OUT is written before %%TMP1 and %%HI128 are read for
;;; the last time, hence the aliasing restriction below.
%macro VCLMUL_REDUCE 6
%define %%OUT %1 ; [out] zmm/ymm/xmm: result (must not be %%TMP1 or %%HI128)
%define %%POLY %2 ; [in] zmm/ymm/xmm: polynomial
%define %%HI128 %3 ; [in] zmm/ymm/xmm: high 128b of hash to reduce
%define %%LO128 %4 ; [in] zmm/ymm/xmm: low 128b of hash to reduce
%define %%TMP0 %5 ; [clobbered] zmm/ymm/xmm: temporary register
%define %%TMP1 %6 ; [clobbered] zmm/ymm/xmm: temporary register

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; first phase of the reduction
        vpclmulqdq %%TMP0, %%POLY, %%LO128, 0x01
        vpslldq %%TMP0, %%TMP0, 8 ; shift-L 2 DWs
        vpxorq %%TMP0, %%LO128, %%TMP0 ; first phase of the reduction complete

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; second phase of the reduction
        vpclmulqdq %%TMP1, %%POLY, %%TMP0, 0x00
        vpsrldq %%TMP1, %%TMP1, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R

        vpclmulqdq %%OUT, %%POLY, %%TMP0, 0x10
        vpslldq %%OUT, %%OUT, 4 ; shift-L 1-DW to obtain result with no shifts

        vpternlogq %%OUT, %%TMP1, %%HI128, 0x96 ; OUT/GHASH = OUT xor TMP1 xor HI128
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%endmacro

;;; ===========================================================================
;;;
=========================================================================== +;;; schoolbook multiply (1 to 8 blocks) - 1st step +%macro VCLMUL_1_TO_8_STEP1 8 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [in] ZMM ciphered blocks 4 to 7 +%define %%TMP1 %3 ; [clobbered] ZMM temporary +%define %%TMP2 %4 ; [clobbered] ZMM temporary +%define %%TH %5 ; [out] ZMM high product +%define %%TM %6 ; [out] ZMM medium product +%define %%TL %7 ; [out] ZMM low product +%define %%NBLOCKS %8 ; [in] number of blocks to ghash (0 to 8) + +%if %%NBLOCKS == 8 + VCLMUL_STEP1 %%KP, %%HI, %%TMP1, %%TH, %%TM, %%TL +%elif %%NBLOCKS == 7 + vmovdqu64 %%TMP2, [%%KP + HashKey_3] + vmovdqa64 %%TMP1, [rel mask_out_top_block] + vpandq %%TMP2, %%TMP1 + vpandq %%HI, %%TMP1 + VCLMUL_STEP1 NULL, %%HI, %%TMP1, %%TH, %%TM, %%TL, %%TMP2 +%elif %%NBLOCKS == 6 + vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2] + VCLMUL_STEP1 NULL, YWORD(%%HI), YWORD(%%TMP1), \ + YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2) +%elif %%NBLOCKS == 5 + vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1] + VCLMUL_STEP1 NULL, XWORD(%%HI), XWORD(%%TMP1), \ + XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2) +%else + vpxorq %%TH, %%TH + vpxorq %%TM, %%TM + vpxorq %%TL, %%TL +%endif +%endmacro ; VCLMUL_1_TO_8_STEP1 + +;;; =========================================================================== +;;; =========================================================================== +;;; schoolbook multiply (1 to 8 blocks) - 2nd step +%macro VCLMUL_1_TO_8_STEP2 10 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [out] ZMM ghash high 128bits +%define %%LO %3 ; [in/out] ZMM ciphered blocks 0 to 3 (in); ghash low 128bits (out) +%define %%TMP0 %4 ; [clobbered] ZMM temporary +%define %%TMP1 %5 ; [clobbered] ZMM temporary +%define %%TMP2 %6 ; [clobbered] ZMM temporary +%define %%TH %7 ; [in/clobbered] ZMM high sum +%define %%TM %8 ; [in/clobbered] ZMM medium sum +%define %%TL %9 ; [in/clobbered] ZMM low sum +%define %%NBLOCKS %10 ; 
[in] number of blocks to ghash (0 to 8) + +%if %%NBLOCKS == 8 + VCLMUL_STEP2 %%KP, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL +%elif %%NBLOCKS == 7 + vmovdqu64 %%TMP2, [%%KP + HashKey_7] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 6 + vmovdqu64 %%TMP2, [%%KP + HashKey_6] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 5 + vmovdqu64 %%TMP2, [%%KP + HashKey_5] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 4 + vmovdqu64 %%TMP2, [%%KP + HashKey_4] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 3 + vmovdqu64 %%TMP2, [%%KP + HashKey_3] + vmovdqa64 %%TMP1, [rel mask_out_top_block] + vpandq %%TMP2, %%TMP1 + vpandq %%LO, %%TMP1 + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 2 + vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2] + VCLMUL_STEP2 NULL, YWORD(%%HI), YWORD(%%LO), \ + YWORD(%%TMP0), YWORD(%%TMP1), YWORD(%%TMP2), \ + YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2), 2 +%elif %%NBLOCKS == 1 + vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1] + VCLMUL_STEP2 NULL, XWORD(%%HI), XWORD(%%LO), \ + XWORD(%%TMP0), XWORD(%%TMP1), XWORD(%%TMP2), \ + XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2), 1 +%else ; none of 1..8 matched (NBLOCKS == 0 per the documented range): just zero HI and LO + vpxorq %%HI, %%HI + vpxorq %%LO, %%LO +%endif +%endmacro ; VCLMUL_1_TO_8_STEP2 + +;;; =========================================================================== +;;; =========================================================================== +;;; GHASH 1 to 16 blocks of cipher text +;;; - performs reduction at the end +;;; - can take intermediate GHASH sums as input +%macro GHASH_1_TO_16 20 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%GHASH %2 ; [out] ghash output +%define %%T1 %3 ; [clobbered] temporary ZMM +%define %%T2 %4 ; [clobbered] temporary ZMM +%define 
%%T3 %5 ; [clobbered] temporary ZMM +%define %%T4 %6 ; [clobbered] temporary ZMM +%define %%T5 %7 ; [clobbered] temporary ZMM +%define %%T6 %8 ; [clobbered] temporary ZMM +%define %%T7 %9 ; [clobbered] temporary ZMM +%define %%T8 %10 ; [clobbered] temporary ZMM +%define %%T9 %11 ; [clobbered] temporary ZMM +%define %%GH %12 ; [in/clobbered] ghash sum (high) or "no_zmm" +%define %%GL %13 ; [in/clobbered] ghash sum (low) or "no_zmm" +%define %%GM %14 ; [in/clobbered] ghash sum (medium) or "no_zmm" +%define %%AAD_HASH_IN %15 ; [in] input hash value +%define %%CIPHER_IN0 %16 ; [in/clobbered] ZMM with cipher text blocks 0-3 (input hash value gets XOR-ed into it) +%define %%CIPHER_IN1 %17 ; [in] ZMM with cipher text blocks 4-7 +%define %%CIPHER_IN2 %18 ; [in] ZMM with cipher text blocks 8-11 +%define %%CIPHER_IN3 %19 ; [in] ZMM with cipher text blocks 12-15 +%define %%NUM_BLOCKS %20 ; [in] numerical value, number of blocks + +%define %%T0H %%T1 +%define %%T0L %%T2 +%define %%T0M1 %%T3 +%define %%T0M2 %%T4 + +%define %%T1H %%T5 +%define %%T1L %%T6 +%define %%T1M1 %%T7 +%define %%T1M2 %%T8 + +%define %%HK %%T9 + +%assign hashk HashKey_ %+ %%NUM_BLOCKS +%assign reg_idx 0 +%assign blocks_left %%NUM_BLOCKS + + vpxorq %%CIPHER_IN0, %%CIPHER_IN0, %%AAD_HASH_IN + +%assign first_result 1 + +%ifnidn %%GH, no_zmm +%ifnidn %%GM, no_zmm +%ifnidn %%GL, no_zmm + ;; GHASH sums passed in to be updated and + ;; reduced at the end + vmovdqa64 %%T0H, %%GH + vmovdqa64 %%T0L, %%GL + vmovdqa64 %%T0M1, %%GM + vpxorq %%T0M2, %%T0M2 +%assign first_result 0 +%endif +%endif +%endif + +%rep (blocks_left / 4) +%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx + vmovdqu64 %%HK, [%%KP + hashk] +%if first_result == 1 + vpclmulqdq %%T0H, %%REG_IN, %%HK, 0x11 ; H = a1*b1 + vpclmulqdq %%T0L, %%REG_IN, %%HK, 0x00 ; L = a0*b0 + vpclmulqdq %%T0M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0 + vpclmulqdq %%T0M2, %%REG_IN, %%HK, 0x10 ; M2 = a0*b1 +%assign first_result 0 +%else + vpclmulqdq %%T1H, %%REG_IN, %%HK, 0x11 ; H = a1*b1 + vpclmulqdq %%T1L, %%REG_IN, %%HK, 0x00 ; L = 
a0*b0 + vpclmulqdq %%T1M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0 + vpclmulqdq %%T1M2, %%REG_IN, %%HK, 0x10 ; M2 = a0*b1 + vpxorq %%T0H, %%T0H, %%T1H + vpxorq %%T0L, %%T0L, %%T1L + vpxorq %%T0M1, %%T0M1, %%T1M1 + vpxorq %%T0M2, %%T0M2, %%T1M2 +%endif +%undef %%REG_IN +%assign reg_idx (reg_idx + 1) +%assign hashk (hashk + 64) +%assign blocks_left (blocks_left - 4) +%endrep + +%if blocks_left > 0 +;; There are 1, 2 or 3 blocks left to process. +;; It may also be that they are the only blocks to process. + +%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx + +%if first_result == 1 +;; Case where %%NUM_BLOCKS = 1, 2 or 3 +%xdefine %%OUT_H %%T0H +%xdefine %%OUT_L %%T0L +%xdefine %%OUT_M1 %%T0M1 +%xdefine %%OUT_M2 %%T0M2 +%else +%xdefine %%OUT_H %%T1H +%xdefine %%OUT_L %%T1L +%xdefine %%OUT_M1 %%T1M1 +%xdefine %%OUT_M2 %%T1M2 +%endif + +%if blocks_left == 1 + vmovdqu64 XWORD(%%HK), [%%KP + hashk] + vpclmulqdq XWORD(%%OUT_H), XWORD(%%REG_IN), XWORD(%%HK), 0x11 ; %%OUT_H = a1*b1 + vpclmulqdq XWORD(%%OUT_L), XWORD(%%REG_IN), XWORD(%%HK), 0x00 ; %%OUT_L = a0*b0 + vpclmulqdq XWORD(%%OUT_M1), XWORD(%%REG_IN), XWORD(%%HK), 0x01 ; %%OUT_M1 = a1*b0 + vpclmulqdq XWORD(%%OUT_M2), XWORD(%%REG_IN), XWORD(%%HK), 0x10 ; %%OUT_M2 = a0*b1 +%elif blocks_left == 2 + vmovdqu64 YWORD(%%HK), [%%KP + hashk] + vpclmulqdq YWORD(%%OUT_H), YWORD(%%REG_IN), YWORD(%%HK), 0x11 ; %%OUT_H = a1*b1 + vpclmulqdq YWORD(%%OUT_L), YWORD(%%REG_IN), YWORD(%%HK), 0x00 ; %%OUT_L = a0*b0 + vpclmulqdq YWORD(%%OUT_M1), YWORD(%%REG_IN), YWORD(%%HK), 0x01 ; %%OUT_M1 = a1*b0 + vpclmulqdq YWORD(%%OUT_M2), YWORD(%%REG_IN), YWORD(%%HK), 0x10 ; %%OUT_M2 = a0*b1 +%else ; blocks_left == 3 + vmovdqu64 YWORD(%%HK), [%%KP + hashk] + vinserti64x2 %%HK, [%%KP + hashk + 32], 2 + vpclmulqdq %%OUT_H, %%REG_IN, %%HK, 0x11 ; %%OUT_H = a1*b1 + vpclmulqdq %%OUT_L, %%REG_IN, %%HK, 0x00 ; %%OUT_L = a0*b0 + vpclmulqdq %%OUT_M1, %%REG_IN, %%HK, 0x01 ; %%OUT_M1 = a1*b0 + vpclmulqdq %%OUT_M2, %%REG_IN, %%HK, 0x10 ; %%OUT_M2 = a0*b1 +%endif ; blocks_left + +%undef %%REG_IN +%undef %%OUT_H 
+%undef %%OUT_L +%undef %%OUT_M1 +%undef %%OUT_M2 + +%if first_result != 1 + vpxorq %%T0H, %%T0H, %%T1H + vpxorq %%T0L, %%T0L, %%T1L + vpxorq %%T0M1, %%T0M1, %%T1M1 + vpxorq %%T0M2, %%T0M2, %%T1M2 +%endif + +%endif ; blocks_left > 0 + + ;; integrate TM into TH and TL + vpxorq %%T0M1, %%T0M1, %%T0M2 + vpsrldq %%T1M1, %%T0M1, 8 + vpslldq %%T1M2, %%T0M1, 8 + vpxorq %%T0H, %%T0H, %%T1M1 + vpxorq %%T0L, %%T0L, %%T1M2 + + ;; add TH and TL 128-bit words horizontally + VHPXORI4x128 %%T0H, %%T1M1 + VHPXORI4x128 %%T0L, %%T1M2 + + ;; reduction: POLY2 is loaded into XWORD(%%HK), the result is written to XWORD(%%GHASH) + vmovdqa64 XWORD(%%HK), [rel POLY2] + VCLMUL_REDUCE XWORD(%%GHASH), XWORD(%%HK), \ + XWORD(%%T0H), XWORD(%%T0L), XWORD(%%T0M1), XWORD(%%T0M2) +%endmacro ; GHASH_1_TO_16 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +;;; Input: A and B (128-bits each, bit-reflected) +;;; Output: C = A*B*x mod poly, (i.e. >>1 ) +;;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +;;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; [in/out] 16 Bytes, multiplicand on entry and reduced product on exit +%define %%HK %2 ; [in] 16 Bytes +%define %%T1 %3 ; [clobbered] temporary +%define %%T2 %4 ; [clobbered] temporary +%define %%T3 %5 ; [clobbered] temporary (also holds POLY2 during the reduction) +%define %%T4 %6 ; not referenced by the macro body +%define %%T5 %7 ; not referenced by the macro body + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0 + vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0 + vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1 + vpxorq %%GH, %%GH, %%T3 + + + vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs + vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs + + vpxorq %%T1, %%T1, %%T3 + vpxorq %%GH, %%GH, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu64 %%T3, [rel POLY2] + + vpclmulqdq %%T2, %%T3, %%GH, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs + + vpxorq %%GH, %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%GH, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R + + vpclmulqdq %%GH, %%T3, %%GH, 0x10 + vpslldq %%GH, %%GH, 4 ; Shift-L 1-DW to obtain result with no shifts + + ; second phase of the reduction complete, the result is in %%GH + vpternlogq %%GH, %%T1, %%T2, 0x96 ; GH = GH xor T1 xor T2 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512 +;;; functions, but are kept to allow users to switch cpu architectures between calls +;;; of pre, init, update, and finalize. 
+%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + vmovdqa %%T5, %%HK ; %%T5 = running hash key power, seeded with %%HK + + ;; GHASH keys 2 to 48 or 128 +%ifdef GCM_BIG_DATA +%assign max_hkey_idx 128 +%else +%assign max_hkey_idx 48 +%endif + +%assign i 2 +%rep (max_hkey_idx - 1) + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^i<<1 mod poly + vmovdqu [%%GDATA + HashKey_ %+ i], %%T5 ; [HashKey_i] = %%T5 +%assign i (i + 1) +%endrep + +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; READ_SMALL_DATA_INPUT +;;; Packs xmm register with data when data input is less or equal to 16 bytes +;;; Returns 0 if data has length 0 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 5 +%define %%OUTPUT %1 ; [out] xmm register +%define %%INPUT %2 ; [in] buffer pointer to read from +%define %%LENGTH %3 ; [in] number of bytes to read +%define %%TMP1 %4 ; [clobbered] +%define %%MASK %5 ; [out] k1 to k7 register to store the partial block mask + + cmp %%LENGTH, 16 + jge %%_read_small_data_ge16 ; NOTE(review): signed compare, assumes LENGTH < 2^63 - confirm + lea %%TMP1, [rel byte_len_to_mask_table] +%ifidn __OUTPUT_FORMAT__, win64 + add %%TMP1, %%LENGTH + add %%TMP1, %%LENGTH + kmovw %%MASK, [%%TMP1] +%else + kmovw %%MASK, [%%TMP1 + %%LENGTH*2] +%endif + vmovdqu8 %%OUTPUT{%%MASK}{z}, [%%INPUT] + jmp %%_read_small_data_end +%%_read_small_data_ge16: + VX512LDR %%OUTPUT, [%%INPUT] + mov %%TMP1, 0xffff ; 16 or more bytes: mask with all 16 byte positions set + kmovq %%MASK, %%TMP1 +%%_read_small_data_end: +%endmacro ; READ_SMALL_DATA_INPUT + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 18 +%define %%A_IN %1 ; [in] AAD text pointer +%define %%A_LEN %2 ; [in] AAD length +%define %%AAD_HASH %3 ; [out] xmm ghash value +%define %%GDATA_KEY %4 ; [in] pointer to keys +%define %%ZT0 %5 ; [clobbered] ZMM register +%define %%ZT1 %6 ; [clobbered] ZMM register +%define %%ZT2 %7 ; [clobbered] ZMM register +%define %%ZT3 %8 ; [clobbered] ZMM register +%define %%ZT4 %9 ; [clobbered] ZMM register +%define %%ZT5 %10 ; [clobbered] ZMM register +%define %%ZT6 %11 ; [clobbered] ZMM register +%define %%ZT7 %12 ; [clobbered] ZMM register +%define %%ZT8 %13 ; [clobbered] ZMM register +%define %%ZT9 %14 ; [clobbered] ZMM register +%define %%T1 %15 ; [clobbered] GP register +%define %%T2 %16 ; [clobbered] GP register +%define %%T3 %17 ; [clobbered] GP register +%define %%MASKREG %18 ; [clobbered] mask register + +%define %%SHFMSK %%ZT9 +%define %%POLY %%ZT8 +%define %%TH %%ZT7 +%define %%TM %%ZT6 +%define %%TL %%ZT5 + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + vpxorq %%AAD_HASH, %%AAD_HASH ; hash starts from zero + + vmovdqa64 %%SHFMSK, [rel SHUF_MASK] + vmovdqa64 %%POLY, [rel POLY2] + +%%_get_AAD_loop128: + cmp %%T2, 128 + jl %%_exit_AAD_loop128 ; fewer than 128 bytes (8 blocks) left + + vmovdqu64 %%ZT2, [%%T1 + 64*0] ; LO blocks (0-3) + vmovdqu64 %%ZT1, [%%T1 + 64*1] ; HI blocks (4-7) + vpshufb %%ZT2, %%SHFMSK + vpshufb %%ZT1, %%SHFMSK + + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + + VCLMUL_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%TH, %%TM, %%TL + VCLMUL_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, %%ZT0, %%ZT3, %%ZT4, %%TH, %%TM, %%TL + + ;; result in %%ZT1(H):%%ZT2(L) + ;; reduce and put the result in AAD_HASH + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \ + XWORD(%%ZT0), XWORD(%%ZT3) + + sub %%T2, 128 + je %%_CALC_AAD_done + + add %%T1, 128 + jmp %%_get_AAD_loop128 + +%%_exit_AAD_loop128: + or %%T2, %%T2 + jz %%_CALC_AAD_done + + ;; prep mask source address + lea %%T3, [rel 
byte64_len_to_mask_table] + lea %%T3, [%%T3 + %%T2*8] + + ;; calculate number of blocks to ghash (including partial bytes) + add %%T2, 15 + and %%T2, -16 ; 1 to 8 blocks possible here + shr %%T2, 4 + cmp %%T2, 7 + je %%_AAD_blocks_7 + cmp %%T2, 6 + je %%_AAD_blocks_6 + cmp %%T2, 5 + je %%_AAD_blocks_5 + cmp %%T2, 4 + je %%_AAD_blocks_4 + cmp %%T2, 3 + je %%_AAD_blocks_3 + cmp %%T2, 2 + je %%_AAD_blocks_2 + cmp %%T2, 1 + je %%_AAD_blocks_1 + ;; fall through for 8 blocks + + ;; The flow of each of these cases is identical: + ;; - load blocks plain text + ;; - shuffle loaded blocks + ;; - xor in current hash value into block 0 + ;; - perform multiplications with ghash keys + ;; - jump to reduction code +%%_AAD_blocks_8: + sub %%T3, (64 * 8) ; step back 64 table entries (8 bytes each): mask for the bytes past the first 64 + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb %%ZT1, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 8 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 8 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_7: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb %%ZT1, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 7 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 7 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_6: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 YWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb YWORD(%%ZT1), YWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 6 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, 
%%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 6 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_5: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 XWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb XWORD(%%ZT1), XWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 5 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 5 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_4: + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb %%ZT2, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 4 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 4 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_3: + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb %%ZT2, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 3 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 3 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_2: + kmovq %%MASKREG, [%%T3] + vmovdqu8 YWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb YWORD(%%ZT2), YWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 2 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 2 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_1: + kmovq %%MASKREG, [%%T3] + vmovdqu8 XWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb XWORD(%%ZT2), XWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 1 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 1 + 
+%%_AAD_blocks_done: + ;; Multiplications have been done. Do the reduction now + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \ + XWORD(%%ZT0), XWORD(%%ZT3) +%%_CALC_AAD_done: + ;; result in AAD_HASH + +%endmacro ; CALC_AAD_HASH + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; PARTIAL_BLOCK +;;; Handles encryption/decryption and the tag partial blocks between +;;; update calls. +;;; Requires the input data be at least 1 byte long. +;;; Output: +;;; A cipher/plain of the first partial block (CYPH_PLAIN_OUT), +;;; AAD_HASH and updated GDATA_CTX +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 22 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer +%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length +%define %%DATA_OFFSET %6 ; [in/out] data offset (gets updated) +%define %%AAD_HASH %7 ; [in/out] GHASH value (the new block is XOR-ed into it, then it is updated) +%define %%ENC_DEC %8 ; [in] cipher direction +%define %%GPTMP0 %9 ; [clobbered] GP temporary register +%define %%GPTMP1 %10 ; [clobbered] GP temporary register +%define %%GPTMP2 %11 ; [clobbered] GP temporary register +%define %%ZTMP0 %12 ; [clobbered] ZMM temporary register +%define %%ZTMP1 %13 ; [clobbered] ZMM temporary register +%define %%ZTMP2 %14 ; [clobbered] ZMM temporary register +%define %%ZTMP3 %15 ; [clobbered] ZMM temporary register +%define %%ZTMP4 %16 ; [clobbered] ZMM temporary register +%define %%ZTMP5 %17 ; [clobbered] ZMM temporary register +%define %%ZTMP6 %18 ; [clobbered] ZMM temporary register +%define %%ZTMP7 %19 ; [clobbered] ZMM temporary register +%define %%ZTMP8 %20 ; [clobbered] ZMM temporary register +%define %%ZTMP9 %21 ; [clobbered] ZMM temporary register +%define %%MASKREG %22 ; [clobbered] mask temporary register + +%define %%XTMP0 XWORD(%%ZTMP0) +%define %%XTMP1 XWORD(%%ZTMP1) +%define %%XTMP2 
XWORD(%%ZTMP2) +%define %%XTMP3 XWORD(%%ZTMP3) +%define %%XTMP4 XWORD(%%ZTMP4) +%define %%XTMP5 XWORD(%%ZTMP5) +%define %%XTMP6 XWORD(%%ZTMP6) +%define %%XTMP7 XWORD(%%ZTMP7) +%define %%XTMP8 XWORD(%%ZTMP8) +%define %%XTMP9 XWORD(%%ZTMP9) + +%define %%LENGTH %%GPTMP0 +%define %%IA0 %%GPTMP1 +%define %%IA1 %%GPTMP2 + + mov %%LENGTH, [%%GDATA_CTX + PBlockLen] + or %%LENGTH, %%LENGTH + je %%_partial_block_done ;Leave Macro if no partial blocks + + READ_SMALL_DATA_INPUT %%XTMP0, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%IA0, %%MASKREG + + ;; XTMP1 = my_ctx_data.partial_block_enc_key + vmovdqu64 %%XTMP1, [%%GDATA_CTX + PBlockEncKey] + vmovdqu64 %%XTMP2, [%%GDATA_KEY + HashKey] + + ;; adjust the shuffle mask pointer to be able to shift right %%LENGTH bytes + ;; ((16 - %%LENGTH) is the number of bytes in plaintext mod 16) + lea %%IA0, [rel SHIFT_MASK] + add %%IA0, %%LENGTH + vmovdqu64 %%XTMP3, [%%IA0] ; shift right shuffle mask + vpshufb %%XTMP1, %%XTMP3 + +%ifidn %%ENC_DEC, DEC + ;; keep copy of cipher text in %%XTMP4 + vmovdqa64 %%XTMP4, %%XTMP0 +%endif + vpxorq %%XTMP1, %%XTMP0 ; Cyphertext XOR E(K, Yn) + + ;; Set %%IA1 to be the amount of data left in CYPH_PLAIN_IN after filling the block + ;; (negative when the input does not fill the block) + ;; Determine if partial block is not being filled and shift mask accordingly + mov %%IA1, %%PLAIN_CYPH_LEN + add %%IA1, %%LENGTH + sub %%IA1, 16 + jge %%_no_extra_mask + sub %%IA0, %%IA1 +%%_no_extra_mask: + ;; get the appropriate mask to mask out bottom %%LENGTH bytes of %%XTMP1 + ;; - mask out bottom %%LENGTH bytes of %%XTMP1 + vmovdqu64 %%XTMP0, [%%IA0 + ALL_F - SHIFT_MASK] + vpand %%XTMP1, %%XTMP0 + +%ifidn %%ENC_DEC, DEC + vpand %%XTMP4, %%XTMP0 + vpshufb %%XTMP4, [rel SHUF_MASK] + vpshufb %%XTMP4, %%XTMP3 + vpxorq %%AAD_HASH, %%XTMP4 +%else + vpshufb %%XTMP1, [rel SHUF_MASK] + vpshufb %%XTMP1, %%XTMP3 + vpxorq %%AAD_HASH, %%XTMP1 +%endif + cmp %%IA1, 0 + jl %%_partial_incomplete + + ;; GHASH computation for the last <16 Byte block + GHASH_MUL %%AAD_HASH, %%XTMP2, %%XTMP5, %%XTMP6, 
%%XTMP7, %%XTMP8, %%XTMP9 + + mov qword [%%GDATA_CTX + PBlockLen], 0 + + ;; Set %%LENGTH to be the number of bytes to write out (16 - previous PBlockLen) + mov %%IA0, %%LENGTH + mov %%LENGTH, 16 + sub %%LENGTH, %%IA0 + jmp %%_enc_dec_done + +%%_partial_incomplete: +%ifidn __OUTPUT_FORMAT__, win64 + mov %%IA0, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], %%IA0 +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif + mov %%LENGTH, %%PLAIN_CYPH_LEN + +%%_enc_dec_done: + ;; output encrypted Bytes + + lea %%IA0, [rel byte_len_to_mask_table] + kmovw %%MASKREG, [%%IA0 + %%LENGTH*2] + vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH + +%ifidn %%ENC_DEC, ENC + ;; shuffle XTMP1 back to output as ciphertext + vpshufb %%XTMP1, [rel SHUF_MASK] + vpshufb %%XTMP1, %%XTMP3 +%endif + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{%%MASKREG}, %%XTMP1 + add %%DATA_OFFSET, %%LENGTH +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +%macro GHASH_SINGLE_MUL 9 +%define %%GDATA %1 +%define %%HASHKEY %2 +%define %%CIPHER %3 +%define %%STATE_11 %4 +%define %%STATE_00 %5 +%define %%STATE_MID %6 +%define %%T1 %7 +%define %%T2 %8 +%define %%FIRST %9 + + vmovdqu %%T1, [%%GDATA + %%HASHKEY] +%ifidn %%FIRST, first + vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%STATE_11 = a1*b1 + vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%STATE_00 = a0*b0 + vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%STATE_MID = a1*b0 + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T2 = a0*b1 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%else + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11 + vpxor %%STATE_11, %%STATE_11, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00 + vpxor %%STATE_00, %%STATE_00, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01 + vpxor %%STATE_MID, %%STATE_MID, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%endif + +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; This macro is used to "warm-up" pipeline for GHASH_8_ENCRYPT_8_PARALLEL +;;; macro code. 
It is called only for data lengths 128 and above. +;;; The flow is as follows: +;;; - encrypt the initial %%num_initial_blocks blocks (can be 0) +;;; - encrypt the next 8 blocks and stitch with +;;; GHASH for the first %%num_initial_blocks +;;; - the last 8th block can be partial (lengths between 129 and 239) +;;; - partial block ciphering is handled within this macro +;;; - top bytes of such block are cleared for +;;; the subsequent GHASH calculations +;;; - PBlockEncKey needs to be setup in case of multi-call +;;; - top bytes of the block need to include encrypted counter block so that +;;; when handling partial block case text is read and XOR'ed against it. +;;; This needs to be in un-shuffled format. + +%macro INITIAL_BLOCKS 26-27 +%define %%GDATA_KEY %1 ; [in] pointer to GCM keys +%define %%GDATA_CTX %2 ; [in] pointer to GCM context +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer +%define %%LENGTH %5 ; [in/out] number of bytes to process +%define %%DATA_OFFSET %6 ; [in/out] data offset +%define %%num_initial_blocks %7 ; [in] can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%CTR %8 ; [in/out] XMM counter block +%define %%AAD_HASH %9 ; [in/out] ZMM with AAD hash +%define %%ZT1 %10 ; [out] ZMM cipher blocks 0-3 for GHASH +%define %%ZT2 %11 ; [out] ZMM cipher blocks 4-7 for GHASH +%define %%ZT3 %12 ; [clobbered] ZMM temporary +%define %%ZT4 %13 ; [clobbered] ZMM temporary +%define %%ZT5 %14 ; [clobbered] ZMM temporary +%define %%ZT6 %15 ; [clobbered] ZMM temporary +%define %%ZT7 %16 ; [clobbered] ZMM temporary +%define %%ZT8 %17 ; [clobbered] ZMM temporary +%define %%ZT9 %18 ; [clobbered] ZMM temporary +%define %%ZT10 %19 ; [clobbered] ZMM temporary +%define %%ZT11 %20 ; [clobbered] ZMM temporary +%define %%ZT12 %21 ; [clobbered] ZMM temporary +%define %%IA0 %22 ; [clobbered] GP temporary +%define %%IA1 %23 ; [clobbered] GP temporary +%define %%ENC_DEC %24 ; [in] ENC/DEC selector +%define %%MASKREG %25 ; [clobbered] mask 
register +%define %%SHUFMASK %26 ; [in] ZMM with BE/LE shuffle mask +%define %%PARTIAL_PRESENT %27 ; [in] "no_partial_block" option can be passed here (if length is guaranteed to be > 15*16 bytes) + +%define %%T1 XWORD(%%ZT1) +%define %%T2 XWORD(%%ZT2) +%define %%T3 XWORD(%%ZT3) +%define %%T4 XWORD(%%ZT4) +%define %%T5 XWORD(%%ZT5) +%define %%T6 XWORD(%%ZT6) +%define %%T7 XWORD(%%ZT7) +%define %%T8 XWORD(%%ZT8) +%define %%T9 XWORD(%%ZT9) + +%define %%TH %%ZT10 +%define %%TM %%ZT11 +%define %%TL %%ZT12 + +;; determine if partial block code needs to be added +%assign partial_block_possible 1 +%if %0 > 26 +%ifidn %%PARTIAL_PRESENT, no_partial_block +%assign partial_block_possible 0 +%endif +%endif + +%if %%num_initial_blocks > 0 + ;; prepare AES counter blocks +%if %%num_initial_blocks == 1 + vpaddd %%T3, %%CTR, [rel ONE] +%elif %%num_initial_blocks == 2 + vshufi64x2 YWORD(%%ZT3), YWORD(%%CTR), YWORD(%%CTR), 0 + vpaddd YWORD(%%ZT3), YWORD(%%ZT3), [rel ddq_add_1234] +%else + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234] + vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678] +%endif + + ;; extract new counter value into %%CTR (last prepared counter block, taken from %%ZT3 or %%ZT4) + ;; shuffle the counters for AES rounds +%if %%num_initial_blocks <= 4 + vextracti32x4 %%CTR, %%ZT3, (%%num_initial_blocks - 1) +%else + vextracti32x4 %%CTR, %%ZT4, (%%num_initial_blocks - 5) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK + + ;; load plain/cipher text + ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%ZT5, %%ZT6, no_zmm, no_zmm + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 +%rep (NROUNDS + 2) + vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT1, j, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%num_initial_blocks, NROUNDS 
+%assign j (j + 1) +%endrep + + ;; write cipher/plain text back to output and + ;; zero bytes outside the mask before hashing + ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%ZT3, %%ZT4, no_zmm, no_zmm + + ;; Shuffle the cipher text blocks for hashing part + ;; ZT5 and ZT6 are expected outputs with blocks for hashing +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in ZT5 & ZT6 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%else + ;; Encrypt case + ;; - cipher blocks are in ZT3 & ZT4 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%endif ; Encrypt + + ;; adjust data offset and length + sub %%LENGTH, (%%num_initial_blocks * 16) + add %%DATA_OFFSET, (%%num_initial_blocks * 16) + + ;; At this stage + ;; - ZT5:ZT6 include cipher blocks to be GHASH'ed + +%endif ; %%num_initial_blocks > 0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; - cipher of %%num_initial_blocks is done + ;; - prepare counter blocks for the next 8 blocks (ZT3 & ZT4) + ;; - save the last block in %%CTR + ;; - shuffle the blocks for AES + ;; - stitch encryption of the new blocks with + ;; GHASHING the previous blocks + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234] + vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678] + vextracti32x4 %%CTR, %%ZT4, 3 + + vpshufb %%ZT3, %%SHUFMASK + vpshufb %%ZT4, %%SHUFMASK + +%if partial_block_possible != 0 + ;; get text load/store mask (assume full mask by default) + mov %%IA0, 0xffff_ffff_ffff_ffff +%if %%num_initial_blocks > 0 + ;; NOTE: 'jge' is always taken for %%num_initial_blocks = 0 + ;; This macro is executed for length 128 and up, + ;; zero 
length is checked in GCM_ENC_DEC. + ;; We know there is partial block if: + ;; LENGTH - 16*num_initial_blocks < 128 + cmp %%LENGTH, 128 + jge %%_initial_partial_block_continue + mov %%IA1, rcx + mov rcx, 128 + sub rcx, %%LENGTH + shr %%IA0, cl + mov rcx, %%IA1 +%%_initial_partial_block_continue: +%endif + kmovq %%MASKREG, %%IA0 + ;; load plain or cipher text (masked) + ZMM_LOAD_MASKED_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, %%MASKREG +%else + ;; load plain or cipher text + ZMM_LOAD_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%ZT1, %%ZT2, no_zmm, no_zmm +%endif ;; partial_block_possible + + ;; === AES ROUND 0 +%assign aes_round 0 + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) + + ;; === GHASH blocks 4-7 +%if (%%num_initial_blocks > 0) + ;; Hash in AES state + vpxorq %%ZT5, %%ZT5, %%AAD_HASH + + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT6, %%ZT8, %%ZT9, \ + %%TH, %%TM, %%TL, %%num_initial_blocks +%endif + + ;; === [1/3] of AES rounds + +%rep ((NROUNDS + 1) / 3) + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; %rep ((NROUNDS + 1) / 3) + + ;; === GHASH blocks 0-3 and gather +%if (%%num_initial_blocks > 0) + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT6, %%ZT5, \ + %%ZT7, %%ZT8, %%ZT9, \ + %%TH, %%TM, %%TL, %%num_initial_blocks +%endif + + ;; === [2/3] of AES rounds + +%rep ((NROUNDS + 1) / 3) + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; %rep ((NROUNDS + 1) / 3) + + ;; === GHASH reduction 
+ +%if (%%num_initial_blocks > 0) + ;; [out] AAD_HASH - hash output + ;; [in] T8 - polynomial + ;; [in] T6 - high, T5 - low + ;; [clobbered] T9, T7 - temporary + vmovdqu64 %%T8, [rel POLY2] + VCLMUL_REDUCE XWORD(%%AAD_HASH), %%T8, %%T6, %%T5, %%T7, %%T9 +%endif + + ;; === [3/3] of AES rounds + +%rep (((NROUNDS + 1) / 3) + 2) +%if aes_round < (NROUNDS + 2) + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif +%endrep ; %rep (((NROUNDS + 1) / 3) + 2) + +%if partial_block_possible != 0 + ;; write cipher/plain text back to output and + ;; zero bytes outside the mask before hashing + ZMM_STORE_MASKED_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, %%MASKREG + ;; check if there is partial block + cmp %%LENGTH, 128 + jl %%_initial_save_partial + ;; adjust offset and length + add %%DATA_OFFSET, 128 + sub %%LENGTH, 128 + jmp %%_initial_blocks_done +%%_initial_save_partial: + ;; partial block case + ;; - save the partial block in unshuffled format + ;; - ZT4 is partially XOR'ed with data and top bytes contain + ;; encrypted counter block only + ;; - save number of bytes processed in the partial block + ;; - adjust offset and zero the length + ;; - clear top bytes of the partial block for subsequent GHASH calculations + vextracti32x4 [%%GDATA_CTX + PBlockEncKey], %%ZT4, 3 + add %%DATA_OFFSET, %%LENGTH + sub %%LENGTH, (128 - 16) + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + xor %%LENGTH, %%LENGTH + vmovdqu8 %%ZT4{%%MASKREG}{z}, %%ZT4 +%%_initial_blocks_done: +%else + ZMM_STORE_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%ZT3, %%ZT4, no_zmm, no_zmm + add %%DATA_OFFSET, 128 + sub %%LENGTH, 128 +%endif ;; partial_block_possible + + ;; Shuffle AES result for GHASH. 
+%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in ZT1 & ZT2 + vpshufb %%ZT1, %%SHUFMASK + vpshufb %%ZT2, %%SHUFMASK +%else + ;; Encrypt case + ;; - cipher blocks are in ZT3 & ZT4 + vpshufb %%ZT1, %%ZT3, %%SHUFMASK + vpshufb %%ZT2, %%ZT4, %%SHUFMASK +%endif ; Encrypt + + ;; Current hash value is in AAD_HASH + + ;; Combine GHASHed value with the corresponding ciphertext + vpxorq %%ZT1, %%ZT1, %%AAD_HASH + +%endmacro ; INITIAL_BLOCKS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block. +;;; It may look similar to INITIAL_BLOCKS but its usage is different: +;;; - first encrypts/decrypts required number of blocks and then +;;; ghashes these blocks +;;; - Small packets or left over data chunks (<256 bytes) +;;; - single or multi call +;;; - Remaining data chunks below 256 bytes (multi buffer code) +;;; +;;; num_initial_blocks is expected to include the partial final block +;;; in the count. 
+%macro INITIAL_BLOCKS_PARTIAL 41 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] text out pointer +%define %%PLAIN_CYPH_IN %4 ; [in] text in pointer +%define %%LENGTH %5 ; [in/clobbered] length in bytes +%define %%DATA_OFFSET %6 ; [in/out] current data offset (updated) +%define %%num_initial_blocks %7 ; [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0) +%define %%CTR %8 ; [in/out] current counter value +%define %%HASH_IN_OUT %9 ; [in/out] XMM ghash in/out value +%define %%ENC_DEC %10 ; [in] cipher direction (ENC/DEC) +%define %%INSTANCE_TYPE %11 ; [in] multi_call or single_call +%define %%ZT0 %12 ; [clobbered] ZMM temporary +%define %%ZT1 %13 ; [clobbered] ZMM temporary +%define %%ZT2 %14 ; [clobbered] ZMM temporary +%define %%ZT3 %15 ; [clobbered] ZMM temporary +%define %%ZT4 %16 ; [clobbered] ZMM temporary +%define %%ZT5 %17 ; [clobbered] ZMM temporary +%define %%ZT6 %18 ; [clobbered] ZMM temporary +%define %%ZT7 %19 ; [clobbered] ZMM temporary +%define %%ZT8 %20 ; [clobbered] ZMM temporary +%define %%ZT9 %21 ; [clobbered] ZMM temporary +%define %%ZT10 %22 ; [clobbered] ZMM temporary +%define %%ZT11 %23 ; [clobbered] ZMM temporary +%define %%ZT12 %24 ; [clobbered] ZMM temporary +%define %%ZT13 %25 ; [clobbered] ZMM temporary +%define %%ZT14 %26 ; [clobbered] ZMM temporary +%define %%ZT15 %27 ; [clobbered] ZMM temporary +%define %%ZT16 %28 ; [clobbered] ZMM temporary +%define %%ZT17 %29 ; [clobbered] ZMM temporary +%define %%ZT18 %30 ; [clobbered] ZMM temporary +%define %%ZT19 %31 ; [clobbered] ZMM temporary +%define %%ZT20 %32 ; [clobbered] ZMM temporary +%define %%ZT21 %33 ; [clobbered] ZMM temporary +%define %%ZT22 %34 ; [clobbered] ZMM temporary +%define %%GH %35 ; [in/clobbered] ZMM ghash sum (high) or no_zmm +%define %%GL %36 ; [in/clobbered] ZMM ghash sum (low) or no_zmm +%define %%GM %37 ; [in] ZMM ghash sum (middle) or no_zmm +%define %%IA0 %38 ; [clobbered] GP temporary +%define %%IA1 %39 ; [clobbered] GP temporary 
+%define %%MASKREG %40 ; [clobbered] mask register +%define %%SHUFMASK %41 ; [in] ZMM with BE/LE shuffle mask + +%define %%T1 XWORD(%%ZT1) +%define %%T2 XWORD(%%ZT2) +%define %%T7 XWORD(%%ZT7) + +%define %%CTR0 %%ZT3 +%define %%CTR1 %%ZT4 +%define %%CTR2 %%ZT8 +%define %%CTR3 %%ZT9 + +%define %%DAT0 %%ZT5 +%define %%DAT1 %%ZT6 +%define %%DAT2 %%ZT10 +%define %%DAT3 %%ZT11 + +%ifnidn %%GH, no_zmm +%ifnidn %%GL, no_zmm +%ifnidn %%GM, no_zmm + ;; when temporary sums are passed then zero HASH IN value + ;; - whatever it holds it is invalid in this case + vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT +%endif +%endif +%endif + ;; Copy ghash to temp reg + vmovdqa64 %%T2, %%HASH_IN_OUT + + ;; prepare AES counter blocks +%if %%num_initial_blocks == 1 + vpaddd XWORD(%%CTR0), %%CTR, [rel ONE] +%elif %%num_initial_blocks == 2 + vshufi64x2 YWORD(%%CTR0), YWORD(%%CTR), YWORD(%%CTR), 0 + vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_add_1234] +%else + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%CTR0, ZWORD(%%CTR), [rel ddq_add_1234] +%if %%num_initial_blocks > 4 + vpaddd %%CTR1, ZWORD(%%CTR), [rel ddq_add_5678] +%endif +%if %%num_initial_blocks > 8 + vpaddd %%CTR2, %%CTR0, [rel ddq_add_8888] +%endif +%if %%num_initial_blocks > 12 + vpaddd %%CTR3, %%CTR1, [rel ddq_add_8888] +%endif +%endif + + ;; get load/store mask (table index = LENGTH less 64 bytes per preceding ZMM register) + lea %%IA0, [rel byte64_len_to_mask_table] + mov %%IA1, %%LENGTH +%if %%num_initial_blocks > 12 + sub %%IA1, 3 * 64 +%elif %%num_initial_blocks > 8 + sub %%IA1, 2 * 64 +%elif %%num_initial_blocks > 4 + sub %%IA1, 64 +%endif + kmovq %%MASKREG, [%%IA0 + %%IA1*8] + + ;; extract new counter value + ;; shuffle the counters for AES rounds +%if %%num_initial_blocks <= 4 + vextracti32x4 %%CTR, %%CTR0, (%%num_initial_blocks - 1) +%elif %%num_initial_blocks <= 8 + vextracti32x4 %%CTR, %%CTR1, (%%num_initial_blocks - 5) +%elif %%num_initial_blocks <= 12 + vextracti32x4 %%CTR, %%CTR2, (%%num_initial_blocks - 9) +%else + vextracti32x4 %%CTR, %%CTR3, 
(%%num_initial_blocks - 13) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK + + ;; load plain/cipher text + ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 +%rep (NROUNDS + 2) + vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%ZT1, j, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%num_initial_blocks, NROUNDS +%assign j (j + 1) +%endrep + + ;; retrieve the last cipher counter block (partially XOR'ed with text) + ;; - this is needed for partial block cases +%if %%num_initial_blocks <= 4 + vextracti32x4 %%T1, %%CTR0, (%%num_initial_blocks - 1) +%elif %%num_initial_blocks <= 8 + vextracti32x4 %%T1, %%CTR1, (%%num_initial_blocks - 5) +%elif %%num_initial_blocks <= 12 + vextracti32x4 %%T1, %%CTR2, (%%num_initial_blocks - 9) +%else + vextracti32x4 %%T1, %%CTR3, (%%num_initial_blocks - 13) +%endif + + ;; write cipher/plain text back to output (masked store) + ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG + + ;; zero bytes outside the mask before hashing +%if %%num_initial_blocks <= 4 + vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0 +%elif %%num_initial_blocks <= 8 + vmovdqu8 %%CTR1{%%MASKREG}{z}, %%CTR1 +%elif %%num_initial_blocks <= 12 + vmovdqu8 %%CTR2{%%MASKREG}{z}, %%CTR2 +%else + vmovdqu8 %%CTR3{%%MASKREG}{z}, %%CTR3 +%endif + + ;; Shuffle the cipher text blocks for hashing part + ;; DAT0-DAT3 (ZT5, ZT6, ZT10 & ZT11) are expected outputs with blocks for hashing +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in DAT0-DAT3 (ZT5, ZT6, ZT10 & ZT11) + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%DAT0, %%DAT1, 
%%DAT2, %%DAT3, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%else + ;; Encrypt case + ;; - cipher blocks are in CTR0-CTR3 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%endif ; Encrypt + + ;; Extract the last block for partials and multi_call cases +%if %%num_initial_blocks <= 4 + vextracti32x4 %%T7, %%DAT0, %%num_initial_blocks - 1 +%elif %%num_initial_blocks <= 8 + vextracti32x4 %%T7, %%DAT1, %%num_initial_blocks - 5 +%elif %%num_initial_blocks <= 12 + vextracti32x4 %%T7, %%DAT2, %%num_initial_blocks - 9 +%else + vextracti32x4 %%T7, %%DAT3, %%num_initial_blocks - 13 +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Hash all but the last block of data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;; update data offset and length (skip all blocks but the last one) +%if %%num_initial_blocks > 1 + ;; The final block of data may be <16B + add %%DATA_OFFSET, 16 * (%%num_initial_blocks - 1) + sub %%LENGTH, 16 * (%%num_initial_blocks - 1) +%endif + +%if %%num_initial_blocks < 16 + ;; NOTE: for num_initial_blocks = 16 this check is not assembled, the 'jl' would always be taken. + ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256. 
+ cmp %%LENGTH, 16 + jl %%_small_initial_partial_block + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handle a full length final block - encrypt and hash all blocks +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + sub %%LENGTH, 16 + add %%DATA_OFFSET, 16 + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + + ;; Hash all of the data + + ;; ZT2 - incoming AAD hash (low 128bits) + ;; ZT12-ZT20 - temporary registers + GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \ + %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \ + %%ZT17, %%ZT18, %%ZT19, %%ZT20, \ + %%GH, %%GL, %%GM, \ + %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%num_initial_blocks + + jmp %%_small_initial_compute_done +%endif ; %if %%num_initial_blocks < 16 + +%%_small_initial_partial_block: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;;; Handle ghash for a <16B final block + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;; In this case if it's a single call to encrypt we can + ;; hash all of the data but if it's an init / update / finalize + ;; series of calls we need to leave the last block if it's + ;; less than a full block of data. + + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + ;; %%T1 is ciphered counter block + vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%T1 + +%ifidn %%INSTANCE_TYPE, multi_call +%assign k (%%num_initial_blocks - 1) +%assign last_block_to_hash 1 +%else +%assign k (%%num_initial_blocks) +%assign last_block_to_hash 0 +%endif + +%if (%%num_initial_blocks > last_block_to_hash) + + ;; ZT12-ZT20 - temporary registers + GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \ + %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \ + %%ZT17, %%ZT18, %%ZT19, %%ZT20, \ + %%GH, %%GL, %%GM, \ + %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, k + + ;; just fall through no jmp needed +%else + ;; Record that a reduction is not needed - + ;; In this case no hashes are computed because there + ;; is only one initial block and it is < 16B in length. 
+ ;; We only need to check if a reduction is needed if + ;; initial_blocks == 1 and init/update/final is being used. + ;; In this case we may just have a partial block, and that + ;; gets hashed in finalize. + +%assign need_for_reduction 1 +%ifidn %%GH, no_zmm +%ifidn %%GL, no_zmm +%ifidn %%GM, no_zmm +;; if %%GH, %%GL & %%GM are all no_zmm then reduction is not required +%assign need_for_reduction 0 +%endif +%endif +%endif + +%if need_for_reduction == 0 + ;; The hash should end up in HASH_IN_OUT. + ;; The only way we should get here is if there is + ;; a partial block of data, so xor that into the hash. + vpxorq %%HASH_IN_OUT, %%T2, %%T7 +%else + ;; right - here we have nothing to ghash in the small data but + ;; we have GHASH sums passed through that we need to gather and reduce + + ;; integrate TM (GM) into TH (GH) and TL (GL) + vpsrldq %%ZT12, %%GM, 8 + vpslldq %%ZT13, %%GM, 8 + vpxorq %%GH, %%GH, %%ZT12 + vpxorq %%GL, %%GL, %%ZT13 + + ;; add TH and TL 128-bit words horizontally + VHPXORI4x128 %%GH, %%ZT12 + VHPXORI4x128 %%GL, %%ZT13 + + ;; reduction + vmovdqa64 XWORD(%%ZT12), [rel POLY2] + VCLMUL_REDUCE %%HASH_IN_OUT, XWORD(%%ZT12), \ + XWORD(%%GH), XWORD(%%GL), XWORD(%%ZT13), XWORD(%%ZT14) + + vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7 +%endif + ;; The result is in %%HASH_IN_OUT + jmp %%_after_reduction +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; After GHASH reduction +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_small_initial_compute_done: + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If using init/update/finalize, we need to xor any partial block data + ;; into the hash. 
+%if %%num_initial_blocks > 1 + ;; NOTE: for %%num_initial_blocks <= 1 the xor is not assembled +%if %%num_initial_blocks != 16 + ;; NOTE: for %%num_initial_blocks = 16, %%LENGTH, stored in [PBlockLen] is never zero + or %%LENGTH, %%LENGTH + je %%_after_reduction +%endif ; %%num_initial_blocks != 16 + vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7 +%endif ; %%num_initial_blocks > 1 +%endif ; %%INSTANCE_TYPE, multi_call + +%%_after_reduction: + ;; Final hash is now in HASH_IN_OUT + +%endmacro ; INITIAL_BLOCKS_PARTIAL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Main GCM macro stitching cipher with GHASH +;;; - operates on single stream +;;; - encrypts 8 blocks at a time +;;; - ghash the 8 previously encrypted ciphertext blocks +;;; For partial block case and multi_call, AES_PARTIAL_BLOCK on output +;;; contains encrypted counter block. +%macro GHASH_8_ENCRYPT_8_PARALLEL 34-37 +%define %%GDATA %1 ; [in] key pointer +%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer +%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%CTR1 %5 ; [in/out] ZMM counter blocks 0 to 3 +%define %%CTR2 %6 ; [in/out] ZMM counter blocks 4 to 7 +%define %%GHASHIN_AESOUT_B03 %7 ; [in/out] ZMM ghash in / aes out blocks 0 to 3 +%define %%GHASHIN_AESOUT_B47 %8 ; [in/out] ZMM ghash in / aes out blocks 4 to 7 +%define %%AES_PARTIAL_BLOCK %9 ; [out] XMM partial block (AES) +%define %%loop_idx %10 ; [in] counter block prep selection: "in_order" (counters are byte swapped here) or other (counters used as passed) +%define %%ENC_DEC %11 ; [in] cipher direction +%define %%FULL_PARTIAL %12 ; [in] last block type selection "full" or "partial" +%define %%IA0 %13 ; [clobbered] temporary GP register +%define %%IA1 %14 ; [clobbered] temporary GP register +%define %%LENGTH %15 ; [in] length +%define %%INSTANCE_TYPE %16 ; [in] 'single_call' or 'multi_call' selection +%define %%GH4KEY %17 ; [in] ZMM with GHASH keys 4 to 1 +%define %%GH8KEY %18 ; [in] ZMM with 
GHASH keys 8 to 5 +%define %%SHFMSK %19 ; [in] ZMM with byte swap mask for pshufb +%define %%ZT1 %20 ; [clobbered] temporary ZMM (cipher) +%define %%ZT2 %21 ; [clobbered] temporary ZMM (cipher) +%define %%ZT3 %22 ; [clobbered] temporary ZMM (cipher) +%define %%ZT4 %23 ; [clobbered] temporary ZMM (cipher) +%define %%ZT5 %24 ; [clobbered] temporary ZMM (cipher) +%define %%ZT10 %25 ; [clobbered] temporary ZMM (ghash) +%define %%ZT11 %26 ; [clobbered] temporary ZMM (ghash) +%define %%ZT12 %27 ; [clobbered] temporary ZMM (ghash) +%define %%ZT13 %28 ; [clobbered] temporary ZMM (ghash) +%define %%ZT14 %29 ; [clobbered] temporary ZMM (ghash) +%define %%ZT15 %30 ; [clobbered] temporary ZMM (ghash) +%define %%ZT16 %31 ; [clobbered] temporary ZMM (ghash) +%define %%ZT17 %32 ; [clobbered] temporary ZMM (ghash) +%define %%MASKREG %33 ; [clobbered] mask register for partial loads/stores +%define %%DO_REDUCTION %34 ; [in] "do_reduction", "no_reduction", "final_reduction" +%define %%TO_REDUCE_L %35 ; [in/out] ZMM for low 4x128-bit in case of "no_reduction" or "final_reduction" +%define %%TO_REDUCE_H %36 ; [in/out] ZMM for hi 4x128-bit in case of "no_reduction" or "final_reduction" +%define %%TO_REDUCE_M %37 ; [in/out] ZMM for medium 4x128-bit in case of "no_reduction" or "final_reduction" + +%define %%GH1H %%ZT10 +%define %%GH1L %%ZT11 +%define %%GH1M1 %%ZT12 +%define %%GH1M2 %%ZT13 + +%define %%GH2H %%ZT14 +%define %%GH2L %%ZT15 +%define %%GH2M1 %%ZT16 +%define %%GH2M2 %%ZT17 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; populate counter blocks for cipher part +%ifidn %%loop_idx, in_order + ;; %%CTR1 & %%CTR2 are shuffled outside the scope of this macro + ;; it has to be kept in unshuffled format + vpshufb %%ZT1, %%CTR1, %%SHFMSK + vpshufb %%ZT2, %%CTR2, %%SHFMSK +%else + vmovdqa64 %%ZT1, %%CTR1 + vmovdqa64 %%ZT2, %%CTR2 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; stitch AES rounds with GHASH + +%assign aes_round 0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 0 - ARK + 
vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) + + ;;================================================== + ;; GHASH 4 blocks (4 to 7, with GH4KEY) + vpclmulqdq %%GH1H, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x11 ; a1*b1 + vpclmulqdq %%GH1L, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x00 ; a0*b0 + vpclmulqdq %%GH1M1, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x01 ; a1*b0 + vpclmulqdq %%GH1M2, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x10 ; a0*b1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 3 AES rounds +%rep 3 + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; 3 x AES ROUND + + ;; ================================================= + ;; GHASH 4 blocks (0 to 3, with GH8KEY) + vpclmulqdq %%GH2M1, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x10 ; a0*b1 + vpclmulqdq %%GH2M2, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x01 ; a1*b0 + vpclmulqdq %%GH2H, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x11 ; a1*b1 + vpclmulqdq %%GH2L, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x00 ; a0*b0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 3 AES rounds +%rep 3 + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; 3 x AES ROUND + + ;; ================================================= + ;; gather GHASH in GH1L (low) and GH1H (high), or accumulate in TO_REDUCE_x (no_reduction) +%ifidn %%DO_REDUCTION, no_reduction + vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1 + vpternlogq %%TO_REDUCE_M, %%GH1M1, %%GH2M2, 0x96 ; TM: TO_REDUCE_M ^= GH1M1 ^ GH2M2 + vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH: TO_REDUCE_H ^= GH1H ^ GH2H + vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; 
TL: TO_REDUCE_L ^= GH1L ^ GH2L +%endif +%ifidn %%DO_REDUCTION, do_reduction + ;; phase 1: add mid products together (then GH2M1 = TM>>64, GH1M1 = TM<<64) + vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1 + vpxorq %%GH1M1, %%GH1M1, %%GH2M2 + + vpsrldq %%GH2M1, %%GH1M1, 8 + vpslldq %%GH1M1, %%GH1M1, 8 +%endif +%ifidn %%DO_REDUCTION, final_reduction + ;; phase 1: add mid products together (then GH2M1 = TM>>64, GH1M1 = TM<<64) + vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1 + vpternlogq %%GH1M1, %%TO_REDUCE_M, %%GH2M2, 0x96 ; TM: GH1M1 ^= TO_REDUCE_M ^ GH2M2 + + vpsrldq %%GH2M1, %%GH1M1, 8 + vpslldq %%GH1M1, %%GH1M1, 8 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 2 AES rounds +%rep 2 + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; 2 x AES ROUND + + ;; ================================================= + ;; Add mid product to high and low then + ;; horizontal xor of low and high 4x128 +%ifidn %%DO_REDUCTION, final_reduction + vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64 + vpxorq %%GH1H, %%TO_REDUCE_H + vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64 + vpxorq %%GH1L, %%TO_REDUCE_L +%endif +%ifidn %%DO_REDUCTION, do_reduction + vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64 + vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64 +%endif +%ifnidn %%DO_REDUCTION, no_reduction + VHPXORI4x128 %%GH1H, %%GH2H + VHPXORI4x128 %%GH1L, %%GH2L +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 2 AES rounds +%rep 2 +%if (aes_round < (NROUNDS + 1)) + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif ; aes_round < (NROUNDS + 1) +%endrep + + 
;; ================================================= + ;; first phase of reduction (GH2M2 is reused to hold POLY2) +%ifnidn %%DO_REDUCTION, no_reduction + vmovdqu64 XWORD(%%GH2M2), [rel POLY2] + vpclmulqdq XWORD(%%ZT15), XWORD(%%GH2M2), XWORD(%%GH1L), 0x01 + vpslldq XWORD(%%ZT15), XWORD(%%ZT15), 8 ; shift-L 2 DWs + vpxorq XWORD(%%ZT15), XWORD(%%GH1L), XWORD(%%ZT15) ; first phase of the reduction +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 2 AES rounds +%rep 2 +%if (aes_round < (NROUNDS + 1)) + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif ; aes_round < (NROUNDS + 1) +%endrep + + ;; ================================================= + ;; second phase of the reduction +%ifnidn %%DO_REDUCTION, no_reduction + vpclmulqdq XWORD(%%ZT16), XWORD(%%GH2M2), XWORD(%%ZT15), 0x00 + vpsrldq XWORD(%%ZT16), XWORD(%%ZT16), 4 ; shift-R 1-DW to obtain 2-DWs shift-R + + vpclmulqdq XWORD(%%ZT13), XWORD(%%GH2M2), XWORD(%%ZT15), 0x10 + vpslldq XWORD(%%ZT13), XWORD(%%ZT13), 4 ; shift-L 1-DW for result without shifts + ;; ZT13 = ZT13 xor ZT16 xor GH1H + vpternlogq XWORD(%%ZT13), XWORD(%%ZT16), XWORD(%%GH1H), 0x96 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; all remaining AES rounds but the last +%rep (NROUNDS + 2) +%if (aes_round < (NROUNDS + 1)) + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif ; aes_round < (NROUNDS + 1) +%endrep + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; load/store mask (partial case) and load the text data +%ifidn %%FULL_PARTIAL, full + vmovdqu8 %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64] +%else + lea %%IA0, [rel 
byte64_len_to_mask_table] + mov %%IA1, %%LENGTH + sub %%IA1, 64 + kmovq %%MASKREG, [%%IA0 + 8*%%IA1] + vmovdqu8 %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vmovdqu8 %%ZT5{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64] +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; the last AES round (NROUNDS + 1) and XOR against plain/cipher text + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; store the cipher/plain text data +%ifidn %%FULL_PARTIAL, full + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2 +%else + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64]{%%MASKREG}, %%ZT2 +%endif + + ;; ================================================= + ;; prep cipher text blocks for the next ghash round + +%ifnidn %%FULL_PARTIAL, full +%ifidn %%INSTANCE_TYPE, multi_call + ;; for partial block & multi_call we need encrypted counter block (AES output XOR input text, top 128-bit lane) + vpxorq %%ZT3, %%ZT2, %%ZT5 + vextracti32x4 %%AES_PARTIAL_BLOCK, %%ZT3, 3 +%endif + ;; for GHASH computation purpose clear the top bytes of the partial block +%ifidn %%ENC_DEC, ENC + vmovdqu8 %%ZT2{%%MASKREG}{z}, %%ZT2 +%else + vmovdqu8 %%ZT5{%%MASKREG}{z}, %%ZT5 +%endif +%endif ; %ifnidn %%FULL_PARTIAL, full + + ;; ================================================= + ;; shuffle cipher text blocks for GHASH computation +%ifidn %%ENC_DEC, ENC + vpshufb %%GHASHIN_AESOUT_B03, %%ZT1, %%SHFMSK + vpshufb %%GHASHIN_AESOUT_B47, %%ZT2, %%SHFMSK +%else + vpshufb %%GHASHIN_AESOUT_B03, %%ZT4, %%SHFMSK + vpshufb %%GHASHIN_AESOUT_B47, %%ZT5, %%SHFMSK +%endif + +%ifidn %%DO_REDUCTION, do_reduction + ;; ================================================= + ;; XOR current GHASH value (ZT13) into block 0 + vpxorq %%GHASHIN_AESOUT_B03, %%ZT13 
+%endif +%ifidn %%DO_REDUCTION, final_reduction + ;; ================================================= + ;; Return GHASH value (reduced into the low 128 bits of ZT13) in TO_REDUCE_L + vmovdqa64 %%TO_REDUCE_L, %%ZT13 +%endif + +%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Main GCM macro stitching cipher with GHASH +;;; - operates on single stream +;;; - encrypts 16 blocks at a time +;;; - ghash the 16 previously encrypted ciphertext blocks +;;; - no partial block or multi_call handling here +%macro GHASH_16_ENCRYPT_16_PARALLEL 42 +%define %%GDATA %1 ; [in] key pointer +%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer +%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%CTR_BE %5 ; [in/out] ZMM counter blocks (last 4) in big-endian +%define %%CTR_CHECK %6 ; [in/out] GP with 8-bit counter for overflow check +%define %%HASHKEY_OFFSET %7 ; [in] numerical offset for the highest hash key +%define %%AESOUT_BLK_OFFSET %8 ; [in] numerical offset for AES-CTR out +%define %%GHASHIN_BLK_OFFSET %9 ; [in] numerical offset for GHASH blocks in +%define %%SHFMSK %10 ; [in] ZMM with byte swap mask for pshufb +%define %%ZT1 %11 ; [clobbered] temporary ZMM (cipher) +%define %%ZT2 %12 ; [clobbered] temporary ZMM (cipher) +%define %%ZT3 %13 ; [clobbered] temporary ZMM (cipher) +%define %%ZT4 %14 ; [clobbered] temporary ZMM (cipher) +%define %%ZT5 %15 ; [clobbered/out] temporary ZMM or GHASH OUT (final_reduction) +%define %%ZT6 %16 ; [clobbered] temporary ZMM (cipher) +%define %%ZT7 %17 ; [clobbered] temporary ZMM (cipher) +%define %%ZT8 %18 ; [clobbered] temporary ZMM (cipher) +%define %%ZT9 %19 ; [clobbered] temporary ZMM (cipher) +%define %%ZT10 %20 ; [clobbered] temporary ZMM (ghash) +%define %%ZT11 %21 ; [clobbered] temporary ZMM (ghash) +%define %%ZT12 %22 ; [clobbered] temporary ZMM (ghash) +%define %%ZT13 %23 ; [clobbered] temporary ZMM (ghash) +%define %%ZT14 
%24 ; [clobbered] temporary ZMM (ghash) +%define %%ZT15 %25 ; [clobbered] temporary ZMM (ghash) +%define %%ZT16 %26 ; [clobbered] temporary ZMM (ghash) +%define %%ZT17 %27 ; [clobbered] temporary ZMM (ghash) +%define %%ZT18 %28 ; [clobbered] temporary ZMM (ghash) +%define %%ZT19 %29 ; [clobbered] temporary ZMM +%define %%ZT20 %30 ; [clobbered] temporary ZMM +%define %%ZT21 %31 ; [clobbered] temporary ZMM +%define %%ZT22 %32 ; [clobbered] temporary ZMM +%define %%ZT23 %33 ; [clobbered] temporary ZMM +%define %%ADDBE_4x4 %34 ; [in] ZMM with 4x128bits 4 in big-endian +%define %%ADDBE_1234 %35 ; [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian +%define %%TO_REDUCE_L %36 ; [in/out] ZMM for low 4x128-bit GHASH sum +%define %%TO_REDUCE_H %37 ; [in/out] ZMM for hi 4x128-bit GHASH sum +%define %%TO_REDUCE_M %38 ; [in/out] ZMM for medium 4x128-bit GHASH sum +%define %%DO_REDUCTION %39 ; [in] "no_reduction", "final_reduction", "first_time" +%define %%ENC_DEC %40 ; [in] cipher direction +%define %%DATA_DISPL %41 ; [in] fixed numerical data displacement/offset +%define %%GHASH_IN %42 ; [in] current GHASH value or "no_ghash_in" + +%define %%B00_03 %%ZT1 +%define %%B04_07 %%ZT2 +%define %%B08_11 %%ZT3 +%define %%B12_15 %%ZT4 + +%define %%GH1H %%ZT5 ; @note: do not change this mapping +%define %%GH1L %%ZT6 +%define %%GH1M %%ZT7 +%define %%GH1T %%ZT8 + +%define %%GH2H %%ZT9 +%define %%GH2L %%ZT10 +%define %%GH2M %%ZT11 +%define %%GH2T %%ZT12 + +%define %%RED_POLY %%GH2T +%define %%RED_P1 %%GH2L +%define %%RED_T1 %%GH2H +%define %%RED_T2 %%GH2M + +%define %%GH3H %%ZT13 +%define %%GH3L %%ZT14 +%define %%GH3M %%ZT15 +%define %%GH3T %%ZT16 + +%define %%DATA1 %%ZT13 +%define %%DATA2 %%ZT14 +%define %%DATA3 %%ZT15 +%define %%DATA4 %%ZT16 + +%define %%AESKEY1 %%ZT17 +%define %%AESKEY2 %%ZT18 + +%define %%GHKEY1 %%ZT19 +%define %%GHKEY2 %%ZT20 +%define %%GHDAT1 %%ZT21 +%define %%GHDAT2 %%ZT22 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; prepare counter blocks + + cmp 
BYTE(%%CTR_CHECK), (256 - 16) + jae %%_16_blocks_overflow + vpaddd %%B00_03, %%CTR_BE, %%ADDBE_1234 + vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4 + vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4 + vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4 + jmp %%_16_blocks_ok +%%_16_blocks_overflow: + vpshufb %%CTR_BE, %%CTR_BE, %%SHFMSK + vmovdqa64 %%B12_15, [rel ddq_add_4444] + vpaddd %%B00_03, %%CTR_BE, [rel ddq_add_1234] + vpaddd %%B04_07, %%B00_03, %%B12_15 + vpaddd %%B08_11, %%B04_07, %%B12_15 + vpaddd %%B12_15, %%B08_11, %%B12_15 + vpshufb %%B00_03, %%SHFMSK + vpshufb %%B04_07, %%SHFMSK + vpshufb %%B08_11, %%SHFMSK + vpshufb %%B12_15, %%SHFMSK +%%_16_blocks_ok: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; pre-load constants + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 0)] +%ifnidn %%GHASH_IN, no_ghash_in + vpxorq %%GHDAT1, %%GHASH_IN, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)] +%else + vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)] +%endif + vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (0*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; save counter for the next round + ;; increment counter overflow check register + vshufi64x2 %%CTR_BE, %%B12_15, %%B12_15, 1111_1111b + add BYTE(%%CTR_CHECK), 16 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; pre-load constants + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 1)] + vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (1*64)] + vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (1*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; stitch AES rounds with GHASH + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 0 - ARK + + vpxorq %%B00_03, %%AESKEY1 + vpxorq %%B04_07, %%AESKEY1 + vpxorq %%B08_11, %%AESKEY1 + vpxorq %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 2)] + + ;;================================================== + ;; GHASH 4 blocks (15 to 12) + vpclmulqdq %%GH1H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1 + vpclmulqdq %%GH1L, %%GHDAT1, 
%%GHKEY1, 0x00 ; a0*b0 + vpclmulqdq %%GH1M, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0 + vpclmulqdq %%GH1T, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1 + + vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (2*64)] + vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (2*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 1 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 3)] + + ;; ================================================= + ;; GHASH 4 blocks (11 to 8) + vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1 + vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0 + vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1 + vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0 + + vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (3*64)] + vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (3*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 2 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 4)] + + ;; ================================================= + ;; GHASH 4 blocks (7 to 4) + vpclmulqdq %%GH3M, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1 + vpclmulqdq %%GH3T, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0 + vpclmulqdq %%GH3H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1 + vpclmulqdq %%GH3L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds 3 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 5)] + + ;; ================================================= + ;; Gather (XOR) GHASH for 12 blocks + vpternlogq %%GH1H, %%GH2H, %%GH3H, 0x96 + vpternlogq %%GH1L, %%GH2L, 
%%GH3L, 0x96 + vpternlogq %%GH1T, %%GH2T, %%GH3T, 0x96 + vpternlogq %%GH1M, %%GH2M, %%GH3M, 0x96 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds 4 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 6)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; load plain/cipher text (recycle GH3xx registers) + VX512LDR %%DATA1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)] + VX512LDR %%DATA2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)] + VX512LDR %%DATA3, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)] + VX512LDR %%DATA4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds 5 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 7)] + + ;; ================================================= + ;; GHASH 4 blocks (3 to 0) + vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1 + vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0 + vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1 + vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 6 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 8)] + + ;; ================================================= + ;; gather GHASH in GH1L (low) and GH1H (high) +%ifidn %%DO_REDUCTION, first_time + vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM + vpxorq %%TO_REDUCE_M, %%GH1M, %%GH2M ; TM + vpxorq %%TO_REDUCE_H, %%GH1H, %%GH2H ; TH + vpxorq %%TO_REDUCE_L, %%GH1L, %%GH2L ; 
TL +%endif +%ifidn %%DO_REDUCTION, no_reduction + vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM + vpternlogq %%TO_REDUCE_M, %%GH1M, %%GH2M, 0x96 ; TM + vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH + vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL +%endif +%ifidn %%DO_REDUCTION, final_reduction + ;; phase 1: add mid products together + ;; also load polynomial constant for reduction + vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM + vpternlogq %%GH1M, %%TO_REDUCE_M, %%GH2M, 0x96 + + vpsrldq %%GH2M, %%GH1M, 8 + vpslldq %%GH1M, %%GH1M, 8 + + vmovdqa64 XWORD(%%RED_POLY), [rel POLY2] +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 7 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 9)] + + ;; ================================================= + ;; Add mid product to high and low +%ifidn %%DO_REDUCTION, final_reduction + vpternlogq %%GH1H, %%GH2H, %%GH2M, 0x96 ; TH = TH1 + TH2 + TM>>64 + vpxorq %%GH1H, %%TO_REDUCE_H + vpternlogq %%GH1L, %%GH2L, %%GH1M, 0x96 ; TL = TL1 + TL2 + TM<<64 + vpxorq %%GH1L, %%TO_REDUCE_L +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 8 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 10)] + + ;; ================================================= + ;; horizontal xor of low and high 4x128 +%ifidn %%DO_REDUCTION, final_reduction + VHPXORI4x128 %%GH1H, %%GH2H + VHPXORI4x128 %%GH1L, %%GH2L +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 9 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 +%if (NROUNDS >= 11) + vbroadcastf64x2 
%%AESKEY2, [%%GDATA + (16 * 11)] +%endif + ;; ================================================= + ;; first phase of reduction +%ifidn %%DO_REDUCTION, final_reduction + vpclmulqdq XWORD(%%RED_P1), XWORD(%%RED_POLY), XWORD(%%GH1L), 0x01 + vpslldq XWORD(%%RED_P1), XWORD(%%RED_P1), 8 ; shift-L 2 DWs + vpxorq XWORD(%%RED_P1), XWORD(%%GH1L), XWORD(%%RED_P1) ; first phase of the reduct +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds up to 11 (AES192) or 13 (AES256) + ;; AES128 is done +%if (NROUNDS >= 11) + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 12)] + + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 +%if (NROUNDS == 13) + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 13)] + + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 14)] + + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 +%endif ; GCM256 / NROUNDS = 13 (15 including the first and the last) +%endif ; GCM192 / NROUNDS = 11 (13 including the first and the last) + + ;; ================================================= + ;; second phase of the reduction +%ifidn %%DO_REDUCTION, final_reduction + vpclmulqdq XWORD(%%RED_T1), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x00 + vpsrldq XWORD(%%RED_T1), XWORD(%%RED_T1), 4 ; shift-R 1-DW to obtain 2-DWs shift-R + + vpclmulqdq XWORD(%%RED_T2), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x10 + vpslldq XWORD(%%RED_T2), XWORD(%%RED_T2), 4 ; shift-L 1-DW for result without shifts + ;; GH1H = GH1H x RED_T1 x RED_T2 + vpternlogq 
XWORD(%%GH1H), XWORD(%%RED_T2), XWORD(%%RED_T1), 0x96 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; the last AES round + vaesenclast %%B00_03, %%B00_03, %%AESKEY1 + vaesenclast %%B04_07, %%B04_07, %%AESKEY1 + vaesenclast %%B08_11, %%B08_11, %%AESKEY1 + vaesenclast %%B12_15, %%B12_15, %%AESKEY1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; XOR against plain/cipher text + vpxorq %%B00_03, %%B00_03, %%DATA1 + vpxorq %%B04_07, %%B04_07, %%DATA2 + vpxorq %%B08_11, %%B08_11, %%DATA3 + vpxorq %%B12_15, %%B12_15, %%DATA4 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; store cipher/plain text + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)], %%B00_03 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)], %%B04_07 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)], %%B08_11 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)], %%B12_15 + + ;; ================================================= + ;; shuffle cipher text blocks for GHASH computation +%ifidn %%ENC_DEC, ENC + vpshufb %%B00_03, %%B00_03, %%SHFMSK + vpshufb %%B04_07, %%B04_07, %%SHFMSK + vpshufb %%B08_11, %%B08_11, %%SHFMSK + vpshufb %%B12_15, %%B12_15, %%SHFMSK +%else + vpshufb %%B00_03, %%DATA1, %%SHFMSK + vpshufb %%B04_07, %%DATA2, %%SHFMSK + vpshufb %%B08_11, %%DATA3, %%SHFMSK + vpshufb %%B12_15, %%DATA4, %%SHFMSK +%endif + + ;; ================================================= + ;; store shuffled cipher text for ghashing + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (0*64)], %%B00_03 + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (1*64)], %%B04_07 + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (2*64)], %%B08_11 + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (3*64)], %%B12_15 + +%ifidn %%DO_REDUCTION, final_reduction + ;; ================================================= + ;; Return GHASH value through %%GH1H +%endif + +%endmacro ; GHASH_16_ENCRYPT_16_PARALLEL + 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; GHASH_LAST_8 - GHASH the last 8 cipher text blocks.
;;;
;;; Flow:
;;;   1. VCLMUL_STEP1 is run on blocks 4-7 with the ZTH/ZTM/ZTL accumulators
;;;   2. if the optional product sums (GH/GL/GM) were passed, they are XOR-ed
;;;      into ZTH/ZTL/ZTM before the second step
;;;   3. VCLMUL_STEP2 is run with blocks 0-3
;;;   4. POLY2 is loaded and VCLMUL_REDUCE writes the result to AAD_HASH
;;;
;;; Arguments 11-13 are optional; they are only read when more than
;;; 10 arguments are given.
%macro GHASH_LAST_8 10-13
%define %%GDATA         %1      ; [in] key pointer
%define %%BL47          %2      ; [in/clobbered] ZMM AES blocks 4 to 7
%define %%BL03          %3      ; [in/clobbered] ZMM AES blocks 0 to 3
%define %%ZTH           %4      ; [clobbered] ZMM temporary
%define %%ZTM           %5      ; [clobbered] ZMM temporary
%define %%ZTL           %6      ; [clobbered] ZMM temporary
%define %%ZT01          %7      ; [clobbered] ZMM temporary
%define %%ZT02          %8      ; [clobbered] ZMM temporary
%define %%ZT03          %9      ; [clobbered] ZMM temporary
%define %%AAD_HASH      %10     ; [out] XMM hash value
%define %%GH            %11     ; [in/optional] ZMM with GHASH high product sum
%define %%GL            %12     ; [in/optional] ZMM with GHASH low product sum
%define %%GM            %13     ; [in/optional] ZMM with GHASH mid product sum

        VCLMUL_STEP1    %%GDATA, %%BL47, %%ZT01, %%ZTH, %%ZTM, %%ZTL

%if %0 >= 11
        ;; caller supplied running sums - fold them in ahead of step 2
        vpxorq          %%ZTH, %%GH
        vpxorq          %%ZTL, %%GL
        vpxorq          %%ZTM, %%GM
%endif

        VCLMUL_STEP2    %%GDATA, %%BL47, %%BL03, %%ZT01, %%ZT02, %%ZT03, \
                        %%ZTH, %%ZTM, %%ZTL

        ;; ZT03 is free again after step 2, reuse it for the polynomial
        vmovdqa64       XWORD(%%ZT03), [rel POLY2]
        VCLMUL_REDUCE   %%AAD_HASH, XWORD(%%ZT03), XWORD(%%BL47), XWORD(%%BL03), \
                        XWORD(%%ZT01), XWORD(%%ZT02)
%endmacro ; GHASH_LAST_8

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; GHASH the last 7 cipher text blocks.
;;; - built from the same GHASH helper macros as GHASH_LAST_8, with a twist
;;; - GHASH keys are loaded per data block, so that:
;;;     - blocks 4, 5 and 6 use GHASH keys 3, 2 and 1 respectively
;;;     - the unused block 7 and its GHASH key are zeroed
;;;       (the clmul product is then zero and does not affect the result)
;;;     - blocks 0, 1, 2 and 3 use GHASH keys 7, 6, 5 and 4 respectively
;;; - optionally accepts GHASH product sums as input (arguments 14-16)
;;; - %%MASKREG and %%IA0 are part of the interface but are not referenced
;;;   directly in this macro body
%macro GHASH_LAST_7 13-16
%define %%GDATA         %1      ; [in] key pointer
%define %%BL47          %2      ; [in/clobbered] ZMM AES blocks 4 to 7
%define %%BL03          %3      ; [in/clobbered] ZMM AES blocks 0 to 3
%define %%ZTH           %4      ; [clobbered] ZMM temporary
%define %%ZTM           %5      ; [clobbered] ZMM temporary
%define %%ZTL           %6      ; [clobbered] ZMM temporary
%define %%ZT01          %7      ; [clobbered] ZMM temporary
%define %%ZT02          %8      ; [clobbered] ZMM temporary
%define %%ZT03          %9      ; [clobbered] ZMM temporary
%define %%ZT04          %10     ; [clobbered] ZMM temporary
%define %%AAD_HASH      %11     ; [out] XMM hash value
%define %%MASKREG       %12     ; [clobbered] mask register to use for loads
%define %%IA0           %13     ; [clobbered] GP temporary register
%define %%GH            %14     ; [in/optional] ZMM with GHASH high product sum
%define %%GL            %15     ; [in/optional] ZMM with GHASH low product sum
%define %%GM            %16     ; [in/optional] ZMM with GHASH mid product sum

        ;; reduction polynomial is fetched up front into its own temporary
        vmovdqa64       XWORD(%%ZT04), [rel POLY2]

        VCLMUL_1_TO_8_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZT02, \
                        %%ZTH, %%ZTM, %%ZTL, 7

%if %0 >= 14
        ;; caller supplied running sums - fold them in ahead of step 2
        vpxorq          %%ZTH, %%GH
        vpxorq          %%ZTL, %%GL
        vpxorq          %%ZTM, %%GM
%endif

        VCLMUL_1_TO_8_STEP2 %%GDATA, %%BL47, %%BL03, \
                        %%ZT01, %%ZT02, %%ZT03, \
                        %%ZTH, %%ZTM, %%ZTL, 7

        VCLMUL_REDUCE   %%AAD_HASH, XWORD(%%ZT04), XWORD(%%BL47), XWORD(%%BL03), \
                        XWORD(%%ZT01), XWORD(%%ZT02)
%endmacro ; GHASH_LAST_7


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Encryption of a single block
;;; - %1: pointer to the round keys, 16 bytes per key
;;; - %2: [in/out] XMM register with the block
;;; - key 0 is XOR-ed in, keys 1..NROUNDS go through vaesenc and
;;;   key NROUNDS+1 through vaesenclast
;;; - the assembly-time counter 'i' is overwritten (left at NROUNDS+1)
%macro ENCRYPT_SINGLE_BLOCK 2
%define %%GDATA %1
%define %%XMM0  %2

        vpxorq          %%XMM0, %%XMM0, [%%GDATA + (16 * 0)]
%assign i 1
%rep NROUNDS
        vaesenc         %%XMM0, %%XMM0, [%%GDATA + (16 * i)]
%assign i (i + 1)
%endrep
        vaesenclast     %%XMM0, %%XMM0, [%%GDATA + (16 * i)]
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Save register content for the caller
;;; - carves STACK_FRAME_SIZE bytes out of the stack and rounds rsp down
;;;   to a 64-byte boundary
;;; - the entry value of rsp is kept in the frame (GP slot 4) and in r14
;;; - clobbers rax and r14 (r14 is saved to the frame first)
%macro FUNC_SAVE 0
        ;; Required for Update/GCM_ENC
        mov             rax, rsp                ; remember rsp as seen on entry

        sub             rsp, STACK_FRAME_SIZE
        and             rsp, ~63                ; 64-byte align the frame

        mov             [rsp + STACK_GP_OFFSET + 0*8], r12
        mov             [rsp + STACK_GP_OFFSET + 1*8], r13
        mov             [rsp + STACK_GP_OFFSET + 2*8], r14
        mov             [rsp + STACK_GP_OFFSET + 3*8], r15
        mov             [rsp + STACK_GP_OFFSET + 4*8], rax      ; stack
        mov             r14, rax                ; r14 is used to retrieve stack args
        mov             [rsp + STACK_GP_OFFSET + 5*8], rbp
        mov             [rsp + STACK_GP_OFFSET + 6*8], rbx

%ifidn __OUTPUT_FORMAT__, win64
        ;; rdi, rsi and xmm6:xmm15 need to be maintained for Windows
        mov             [rsp + STACK_GP_OFFSET + 7*8], rdi
        mov             [rsp + STACK_GP_OFFSET + 8*8], rsi

        vmovdqu         [rsp + STACK_XMM_OFFSET + 0*16], xmm6
        vmovdqu         [rsp + STACK_XMM_OFFSET + 1*16], xmm7
        vmovdqu         [rsp + STACK_XMM_OFFSET + 2*16], xmm8
        vmovdqu         [rsp + STACK_XMM_OFFSET + 3*16], xmm9
        vmovdqu         [rsp + STACK_XMM_OFFSET + 4*16], xmm10
        vmovdqu         [rsp + STACK_XMM_OFFSET + 5*16], xmm11
        vmovdqu         [rsp + STACK_XMM_OFFSET + 6*16], xmm12
        vmovdqu         [rsp + STACK_XMM_OFFSET + 7*16], xmm13
        vmovdqu         [rsp + STACK_XMM_OFFSET + 8*16], xmm14
        vmovdqu         [rsp + STACK_XMM_OFFSET + 9*16], xmm15
%endif
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Restore register content for the caller
;;; - counterpart of FUNC_SAVE, reads back the same frame slots
;;; - rsp is reloaded last as all other loads are rsp-relative
%macro FUNC_RESTORE 0

%ifndef SAFE_DATA
        vzeroupper
%else
        clear_scratch_gps_asm
        clear_scratch_zmms_asm
%endif

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu         xmm6,  [rsp + STACK_XMM_OFFSET + 0*16]
        vmovdqu         xmm7,  [rsp + STACK_XMM_OFFSET + 1*16]
        vmovdqu         xmm8,  [rsp + STACK_XMM_OFFSET + 2*16]
        vmovdqu         xmm9,  [rsp + STACK_XMM_OFFSET + 3*16]
        vmovdqu         xmm10, [rsp + STACK_XMM_OFFSET + 4*16]
        vmovdqu         xmm11, [rsp + STACK_XMM_OFFSET + 5*16]
        vmovdqu         xmm12, [rsp + STACK_XMM_OFFSET + 6*16]
        vmovdqu         xmm13, [rsp + STACK_XMM_OFFSET + 7*16]
        vmovdqu         xmm14, [rsp + STACK_XMM_OFFSET + 8*16]
        vmovdqu         xmm15, [rsp + STACK_XMM_OFFSET + 9*16]

        mov             rdi, [rsp + STACK_GP_OFFSET + 7*8]
        mov             rsi, [rsp + STACK_GP_OFFSET + 8*8]
%endif

        ;; Required for Update/GCM_ENC
        mov             r12, [rsp + STACK_GP_OFFSET + 0*8]
        mov             r13, [rsp + STACK_GP_OFFSET + 1*8]
        mov             r14, [rsp + STACK_GP_OFFSET + 2*8]
        mov             r15, [rsp + STACK_GP_OFFSET + 3*8]
        mov             rbp, [rsp + STACK_GP_OFFSET + 5*8]
        mov             rbx, [rsp + STACK_GP_OFFSET + 6*8]
        mov             rsp, [rsp + STACK_GP_OFFSET + 4*8]      ; stack
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
;;; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
;;; Additional Authentication data (A_IN), Additional Data length (A_LEN).
;;; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GCM_INIT 21
%define %%GDATA_KEY     %1      ; [in] GCM expanded keys pointer
%define %%GDATA_CTX     %2      ; [in] GCM context pointer
%define %%IV            %3      ; [in] IV pointer
%define %%A_IN          %4      ; [in] AAD pointer
%define %%A_LEN         %5      ; [in] AAD length in bytes
%define %%GPR1          %6      ; [clobbered] GP register
%define %%GPR2          %7      ; [clobbered] GP register
%define %%GPR3          %8      ; [clobbered] GP register
%define %%MASKREG       %9      ; [clobbered] mask register
%define %%AAD_HASH      %10     ; [out] XMM for AAD_HASH value (xmm14)
%define %%CUR_COUNT     %11     ; [out] XMM with current counter (xmm2)
%define %%ZT0           %12     ; [clobbered] ZMM register
%define %%ZT1           %13     ; [clobbered] ZMM register
%define %%ZT2           %14     ; [clobbered] ZMM register
%define %%ZT3           %15     ; [clobbered] ZMM register
%define %%ZT4           %16     ; [clobbered] ZMM register
%define %%ZT5           %17     ; [clobbered] ZMM register
%define %%ZT6           %18     ; [clobbered] ZMM register
%define %%ZT7           %19     ; [clobbered] ZMM register
%define %%ZT8           %20     ; [clobbered] ZMM register
%define %%ZT9           %21     ; [clobbered] ZMM register

        ;; Context fields written here:
        ;;   AadHash, AadLen, InLen (= 0), PBlockLen (= 0), OrigIV, CurCount
        ;; On exit %%AAD_HASH holds the AAD hash and %%CUR_COUNT holds
        ;; the byte-shuffled counter block (same value as ctx CurCount).

        CALC_AAD_HASH   %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \
                        %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \
                        %%GPR1, %%GPR2, %%GPR3, %%MASKREG

        mov             %%GPR1, %%A_LEN
        vmovdqu64       [%%GDATA_CTX + AadHash], %%AAD_HASH     ; ctx.aad hash = aad_hash
        mov             [%%GDATA_CTX + AadLen], %%GPR1          ; ctx.aad_length = aad_length

        xor             %%GPR1, %%GPR1
        mov             [%%GDATA_CTX + InLen], %%GPR1           ; ctx.in_length = 0
        mov             [%%GDATA_CTX + PBlockLen], %%GPR1       ; ctx.partial_block_length = 0

        ;; read 12 IV bytes and pad with 0x00000001
        ;; - start from the ONEf constant, then merge the low 12 bytes of
        ;;   the IV on top of it using a byte mask with bits 0..11 set
        vmovdqu8        %%CUR_COUNT, [rel ONEf]
        mov             %%GPR2, %%IV
        mov             %%GPR1, 0x0000_0000_0000_0fff
        kmovq           %%MASKREG, %%GPR1
        vmovdqu8        %%CUR_COUNT{%%MASKREG}, [%%GPR2]        ; ctr = IV | 0x1

        vmovdqu64       [%%GDATA_CTX + OrigIV], %%CUR_COUNT     ; ctx.orig_IV = iv

        ;; store IV as counter in LE format
        ;; - EVEX store (vmovdqu64), same as the AadHash/OrigIV stores above,
        ;;   so that %%CUR_COUNT is not restricted to xmm0-xmm15
        vpshufb         %%CUR_COUNT, [rel SHUF_MASK]
        vmovdqu64       [%%GDATA_CTX + CurCount], %%CUR_COUNT   ; ctx.current_counter = iv
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Cipher and ghash of payloads shorter than 256 bytes
;;; - number of blocks in the message comes as argument
;;; - depending on the number of blocks an optimized variant of
;;;   INITIAL_BLOCKS_PARTIAL is invoked
;;; - %%NUM_BLOCKS is compared at run time (compare/branch tree below);
;;;   one INITIAL_BLOCKS_PARTIAL instance per block count (1 to 16) is
;;;   generated at assembly time
;;; - %%PLAIN_CYPH_LEN is accepted for interface reasons but is not
;;;   referenced in the body; %%LENGTH is what gets passed down
;;; - the assembly-time counter 'num_blocks' is overwritten
%macro GCM_ENC_DEC_SMALL 42
%define %%GDATA_KEY         %1  ; [in] key pointer
%define %%GDATA_CTX         %2  ; [in] context pointer
%define %%CYPH_PLAIN_OUT    %3  ; [in] output buffer
%define %%PLAIN_CYPH_IN     %4  ; [in] input buffer
%define %%PLAIN_CYPH_LEN    %5  ; [in] buffer length
%define %%ENC_DEC           %6  ; [in] cipher direction
%define %%DATA_OFFSET       %7  ; [in] data offset
%define %%LENGTH            %8  ; [in] data length
%define %%NUM_BLOCKS        %9  ; [in] number of blocks to process 1 to 16
%define %%CTR               %10 ; [in/out] XMM counter block
%define %%HASH_IN_OUT       %11 ; [in/out] XMM GHASH value
%define %%INSTANCE_TYPE     %12 ; [in] single or multi call
%define %%ZTMP0             %13 ; [clobbered] ZMM register
%define %%ZTMP1             %14 ; [clobbered] ZMM register
%define %%ZTMP2             %15 ; [clobbered] ZMM register
%define %%ZTMP3             %16 ; [clobbered] ZMM register
%define %%ZTMP4             %17 ; [clobbered] ZMM register
%define %%ZTMP5             %18 ; [clobbered] ZMM register
%define %%ZTMP6             %19 ; [clobbered] ZMM register
%define %%ZTMP7             %20 ; [clobbered] ZMM register
%define %%ZTMP8             %21 ; [clobbered] ZMM register
%define %%ZTMP9             %22 ; [clobbered] ZMM register
%define %%ZTMP10            %23 ; [clobbered] ZMM register
%define %%ZTMP11            %24 ; [clobbered] ZMM register
%define %%ZTMP12            %25 ; [clobbered] ZMM register
%define %%ZTMP13            %26 ; [clobbered] ZMM register
%define %%ZTMP14            %27 ; [clobbered] ZMM register
%define %%ZTMP15            %28 ; [clobbered] ZMM register
%define %%ZTMP16            %29 ; [clobbered] ZMM register
%define %%ZTMP17            %30 ; [clobbered] ZMM register
%define %%ZTMP18            %31 ; [clobbered] ZMM register
%define %%ZTMP19            %32 ; [clobbered] ZMM register
%define %%ZTMP20            %33 ; [clobbered] ZMM register
%define %%ZTMP21            %34 ; [clobbered] ZMM register
%define %%ZTMP22            %35 ; [clobbered] ZMM register
%define %%GH                %36 ; [in] ZMM ghash sum (high)
%define %%GL                %37 ; [in] ZMM ghash sum (low)
%define %%GM                %38 ; [in] ZMM ghash sum (middle)
%define %%IA0               %39 ; [clobbered] GP register
%define %%IA1               %40 ; [clobbered] GP register
%define %%MASKREG           %41 ; [clobbered] mask register
%define %%SHUFMASK          %42 ; [in] ZMM with BE/LE shuffle mask

        ;; split the 1..16 range around 8 first
        cmp             %%NUM_BLOCKS, 8
        je              %%_small_initial_num_blocks_is_8
        jl              %%_small_initial_num_blocks_is_7_1

        ;; 9..16 left, split around 12
        cmp             %%NUM_BLOCKS, 12
        je              %%_small_initial_num_blocks_is_12
        jl              %%_small_initial_num_blocks_is_11_9

        ;; 16, 15, 14 or 13
        cmp             %%NUM_BLOCKS, 16
        je              %%_small_initial_num_blocks_is_16
        cmp             %%NUM_BLOCKS, 15
        je              %%_small_initial_num_blocks_is_15
        cmp             %%NUM_BLOCKS, 14
        je              %%_small_initial_num_blocks_is_14
        jmp             %%_small_initial_num_blocks_is_13

%%_small_initial_num_blocks_is_11_9:
        ;; 11, 10 or 9
        cmp             %%NUM_BLOCKS, 11
        je              %%_small_initial_num_blocks_is_11
        cmp             %%NUM_BLOCKS, 10
        je              %%_small_initial_num_blocks_is_10
        jmp             %%_small_initial_num_blocks_is_9

%%_small_initial_num_blocks_is_7_1:
        cmp             %%NUM_BLOCKS, 4
        je              %%_small_initial_num_blocks_is_4
        jl              %%_small_initial_num_blocks_is_3_1
        ;; 7, 6 or 5
        cmp             %%NUM_BLOCKS, 7
        je              %%_small_initial_num_blocks_is_7
        cmp             %%NUM_BLOCKS, 6
        je              %%_small_initial_num_blocks_is_6
        jmp             %%_small_initial_num_blocks_is_5

%%_small_initial_num_blocks_is_3_1:
        ;; 3, 2 or 1
        cmp             %%NUM_BLOCKS, 3
        je              %%_small_initial_num_blocks_is_3
        cmp             %%NUM_BLOCKS, 2
        je              %%_small_initial_num_blocks_is_2

        ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed

        ;; Use rep to generate different block size variants
        ;; - one block size has to be the first one
        ;; - every variant except the last one (16) jumps over the
        ;;   remaining variants to the common exit label
%assign num_blocks 1
%rep 16
%%_small_initial_num_blocks_is_ %+ num_blocks :
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
                %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, num_blocks, \
                %%CTR, %%HASH_IN_OUT, %%ENC_DEC, %%INSTANCE_TYPE, \
                %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
                %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
                %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \
                %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
                %%ZTMP20, %%ZTMP21, %%ZTMP22, \
                %%GH, %%GL, %%GM, \
                %%IA0, %%IA1, %%MASKREG, %%SHUFMASK
%if num_blocks != 16
        jmp             %%_small_initial_blocks_encrypted
%endif
%assign num_blocks (num_blocks + 1)
%endrep

%%_small_initial_blocks_encrypted:

%endmacro ; GCM_ENC_DEC_SMALL

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
; has been initialized by GCM_INIT
; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and zmm0-zmm31, k1 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 7 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer pointer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer pointer +%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length +%define %%ENC_DEC %6 ; [in] cipher direction +%define %%INSTANCE_TYPE %7 ; [in] 'single_call' or 'multi_call' selection + +%define %%IA0 r10 +%define %%IA1 r12 +%define %%IA2 r13 +%define %%IA3 r15 +%define %%IA4 r11 +%define %%IA5 rax + +%define %%LENGTH %%IA2 +%define %%CTR_CHECK %%IA3 +%define %%DATA_OFFSET %%IA4 + +%define %%HASHK_PTR %%IA5 + +%define %%GCM_INIT_CTR_BLOCK xmm2 ; hardcoded in GCM_INIT for now + +%define %%AES_PARTIAL_BLOCK xmm8 +%define %%CTR_BLOCK2z zmm18 +%define %%CTR_BLOCKz zmm9 +%define %%CTR_BLOCKx xmm9 +%define %%AAD_HASHz zmm14 +%define %%AAD_HASHx xmm14 + +;;; ZTMP0 - ZTMP12 - used in by8 code, by128/48 code and GCM_ENC_DEC_SMALL +%define %%ZTMP0 zmm0 +%define %%ZTMP1 zmm3 +%define %%ZTMP2 zmm4 +%define %%ZTMP3 zmm5 +%define %%ZTMP4 zmm6 +%define %%ZTMP5 zmm7 +%define %%ZTMP6 zmm10 +%define %%ZTMP7 zmm11 +%define %%ZTMP8 zmm12 +%define %%ZTMP9 zmm13 +%define %%ZTMP10 zmm15 +%define %%ZTMP11 zmm16 +%define %%ZTMP12 zmm17 + +;;; ZTMP13 - ZTMP22 - used in by128/48 code and GCM_ENC_DEC_SMALL +;;; - some used by8 code as well through TMPxy names +%define %%ZTMP13 zmm19 +%define %%ZTMP14 zmm20 +%define %%ZTMP15 zmm21 +%define %%ZTMP16 zmm30 ; can be used in very/big_loop part +%define %%ZTMP17 zmm31 ; can be used in very/big_loop part +%define %%ZTMP18 zmm1 +%define %%ZTMP19 zmm2 +%define %%ZTMP20 zmm8 +%define %%ZTMP21 zmm22 +%define %%ZTMP22 zmm23 + +;;; Free to use: zmm24 - zmm29 +;;; - used by by128/48 and by8 +%define %%GH zmm24 +%define %%GL zmm25 
+%define %%GM zmm26 +%define %%SHUF_MASK zmm29 +%define %%CTR_BLOCK_SAVE zmm28 + +;;; - used by by128/48 code only +%define %%ADDBE_4x4 zmm27 +%define %%ADDBE_1234 zmm28 ; conflicts with CTR_BLOCK_SAVE + +;; used by8 code only +%define %%GH4KEY %%ZTMP17 +%define %%GH8KEY %%ZTMP16 +%define %%BLK0 %%ZTMP18 +%define %%BLK1 %%ZTMP19 +%define %%ADD8BE zmm27 +%define %%ADD8LE %%ZTMP13 + +%define %%MASKREG k1 + +%ifdef GCM_BIG_DATA +;; reduction every 128 blocks, depth 32 blocks +;; @note 128 blocks is the maximum capacity of the stack frame when +;; GCM_BIG_DATA is defined +%assign very_big_loop_nblocks 128 +%assign very_big_loop_depth 32 +%endif + +;; reduction every 48 blocks, depth 32 blocks +;; @note 48 blocks is the maximum capacity of the stack frame when +;; GCM_BIG_DATA is not defined +%assign big_loop_nblocks 48 +%assign big_loop_depth 32 + +;;; Macro flow: +;;; - for message size bigger than very_big_loop_nblocks process data +;;; with "very_big_loop" parameters +;;; - for message size bigger than big_loop_nblocks process data +;;; with "big_loop" parameters +;;; - calculate the number of 16byte blocks in the message +;;; - process (number of 16byte blocks) mod 8 +;;; '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +;;; - process 8 16 byte blocks at a time until all are done in %%_encrypt_by_8_new + +%ifidn __OUTPUT_FORMAT__, win64 + cmp %%PLAIN_CYPH_LEN, 0 +%else + or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN +%endif + je %%_enc_dec_done + + xor %%DATA_OFFSET, %%DATA_OFFSET + + ;; Update length of data processed +%ifidn __OUTPUT_FORMAT__, win64 + mov %%IA0, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + InLen], %%IA0 +%else + add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN +%endif + vmovdqu64 %%AAD_HASHx, [%%GDATA_CTX + AadHash] + +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: partial block processing makes only sense for multi_call here. + ;; Used for the update flow - if there was a previous partial + ;; block fill the remaining bytes here. 
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%PLAIN_CYPH_LEN, %%DATA_OFFSET, %%AAD_HASHx, %%ENC_DEC, \ + %%IA0, %%IA1, %%IA2, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%MASKREG +%endif + + ;; lift counter block from GCM_INIT to here +%ifidn %%INSTANCE_TYPE, single_call + vmovdqu64 %%CTR_BLOCKx, %%GCM_INIT_CTR_BLOCK +%else + vmovdqu64 %%CTR_BLOCKx, [%%GDATA_CTX + CurCount] +%endif + + ;; Save the amount of data left to process in %%LENGTH + mov %%LENGTH, %%PLAIN_CYPH_LEN +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: %%DATA_OFFSET is zero in single_call case. + ;; Consequently PLAIN_CYPH_LEN will never be zero after + ;; %%DATA_OFFSET subtraction below. + ;; There may be no more data if it was consumed in the partial block. + sub %%LENGTH, %%DATA_OFFSET + je %%_enc_dec_done +%endif ; %%INSTANCE_TYPE, multi_call + + vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK] + vmovdqa64 %%ADDBE_4x4, [rel ddq_addbe_4444] + +%ifdef GCM_BIG_DATA + vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234] + + cmp %%LENGTH, (very_big_loop_nblocks * 16) + jl %%_message_below_very_big_nblocks + + INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%SHUF_MASK, %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth + + sub %%LENGTH, (very_big_loop_nblocks * 16) + cmp %%LENGTH, (very_big_loop_nblocks * 16) + jl %%_no_more_very_big_nblocks + +%%_encrypt_very_big_nblocks: + GHASH_ENCRYPT_Nx16_PARALLEL \ + %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%CTR_BLOCKz, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, 
%%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \ + %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth, %%CTR_CHECK + + sub %%LENGTH, (very_big_loop_nblocks * 16) + cmp %%LENGTH, (very_big_loop_nblocks * 16) + jge %%_encrypt_very_big_nblocks + +%%_no_more_very_big_nblocks: + vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK) + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%GH, %%GL, %%GM, very_big_loop_nblocks, very_big_loop_depth + + or %%LENGTH, %%LENGTH + jz %%_ghash_done + +%%_message_below_very_big_nblocks: +%endif ; GCM_BIG_DATA + + cmp %%LENGTH, (big_loop_nblocks * 16) + jl %%_message_below_big_nblocks + + ;; overwritten above by CTR_BLOCK_SAVE + vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234] + + INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%SHUF_MASK, %%ENC_DEC, big_loop_nblocks, big_loop_depth + + sub %%LENGTH, (big_loop_nblocks * 16) + cmp %%LENGTH, (big_loop_nblocks * 16) + jl %%_no_more_big_nblocks + +%%_encrypt_big_nblocks: + GHASH_ENCRYPT_Nx16_PARALLEL \ + %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%CTR_BLOCKz, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, 
%%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \ + %%ENC_DEC, big_loop_nblocks, big_loop_depth, %%CTR_CHECK + + sub %%LENGTH, (big_loop_nblocks * 16) + cmp %%LENGTH, (big_loop_nblocks * 16) + jge %%_encrypt_big_nblocks + +%%_no_more_big_nblocks: + vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK) + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%GH, %%GL, %%GM, big_loop_nblocks, big_loop_depth + + or %%LENGTH, %%LENGTH + jz %%_ghash_done + +%%_message_below_big_nblocks: + + ;; Less than 256 bytes will be handled by the small message code, which + ;; can process up to 16 x blocks (16 bytes each) + cmp %%LENGTH, (16 * 16) + jge %%_large_message_path + + ;; Determine how many blocks to process + ;; - process one additional block if there is a partial block + mov %%IA1, %%LENGTH + add %%IA1, 15 + shr %%IA1, 4 + ;; %%IA1 can be in the range from 0 to 16 + + GCM_ENC_DEC_SMALL \ + %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, \ + %%LENGTH, %%IA1, %%CTR_BLOCKx, %%AAD_HASHx, %%INSTANCE_TYPE, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + no_zmm, no_zmm, no_zmm, \ + %%IA0, %%IA3, %%MASKREG, %%SHUF_MASK + + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + jmp %%_ghash_done + +%%_large_message_path: + ;; Determine how many blocks to process in INITIAL + ;; - process one additional block in INITIAL if there is a partial block + mov %%IA1, %%LENGTH + and %%IA1, 
0xff + add %%IA1, 15 + shr %%IA1, 4 + ;; Don't allow 8 INITIAL blocks since this will + ;; be handled by the x8 partial loop. + and %%IA1, 7 + je %%_initial_num_blocks_is_0 + cmp %%IA1, 1 + je %%_initial_num_blocks_is_1 + cmp %%IA1, 2 + je %%_initial_num_blocks_is_2 + cmp %%IA1, 3 + je %%_initial_num_blocks_is_3 + cmp %%IA1, 4 + je %%_initial_num_blocks_is_4 + cmp %%IA1, 5 + je %%_initial_num_blocks_is_5 + cmp %%IA1, 6 + je %%_initial_num_blocks_is_6 + +%assign number_of_blocks 7 +%rep 8 +%%_initial_num_blocks_is_ %+ number_of_blocks: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, number_of_blocks, %%CTR_BLOCKx, %%AAD_HASHz, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%IA0, %%IA1, %%ENC_DEC, %%MASKREG, %%SHUF_MASK, no_partial_block +%if number_of_blocks != 0 + jmp %%_initial_blocks_encrypted +%endif +%assign number_of_blocks (number_of_blocks - 1) +%endrep + +%%_initial_blocks_encrypted: + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + ;; move cipher blocks from intial blocks to input of by8 macro + ;; and for GHASH_LAST_8/7 + ;; - ghash value already xor'ed into block 0 + vmovdqa64 %%BLK0, %%ZTMP0 + vmovdqa64 %%BLK1, %%ZTMP1 + + ;; The entire message cannot get processed in INITIAL_BLOCKS + ;; - GCM_ENC_DEC_SMALL handles up to 16 blocks + ;; - INITIAL_BLOCKS processes up to 15 blocks + ;; - no need to check for zero length at this stage + + ;; In order to have only one reduction at the end + ;; start HASH KEY pointer needs to be determined based on length and + ;; call type. 
+ ;; - note that 8 blocks are already ciphered in INITIAL_BLOCKS and + ;; subtracted from LENGTH + lea %%IA1, [%%LENGTH + (8 * 16)] + add %%IA1, 15 + and %%IA1, 0x3f0 +%ifidn %%INSTANCE_TYPE, multi_call + ;; if partial block and multi_call then change hash key start by one + mov %%IA0, %%LENGTH + and %%IA0, 15 + add %%IA0, 15 + and %%IA0, 16 + sub %%IA1, %%IA0 +%endif + lea %%HASHK_PTR, [%%GDATA_KEY + HashKey + 16] + sub %%HASHK_PTR, %%IA1 + ;; HASHK_PTR + ;; - points at the first hash key to start GHASH with + ;; - needs to be updated as the message is processed (incremented) + + ;; pre-load constants + vmovdqa64 %%ADD8BE, [rel ddq_addbe_8888] + vmovdqa64 %%ADD8LE, [rel ddq_add_8888] + vpxorq %%GH, %%GH + vpxorq %%GL, %%GL + vpxorq %%GM, %%GM + + ;; prepare counter 8 blocks + vshufi64x2 %%CTR_BLOCKz, %%CTR_BLOCKz, %%CTR_BLOCKz, 0 + vpaddd %%CTR_BLOCK2z, %%CTR_BLOCKz, [rel ddq_add_5678] + vpaddd %%CTR_BLOCKz, %%CTR_BLOCKz, [rel ddq_add_1234] + vpshufb %%CTR_BLOCKz, %%SHUF_MASK + vpshufb %%CTR_BLOCK2z, %%SHUF_MASK + + ;; Process 7 full blocks plus a partial block + cmp %%LENGTH, 128 + jl %%_encrypt_by_8_partial + +%%_encrypt_by_8_parallel: + ;; in_order vs. out_order is an optimization to increment the counter + ;; without shuffling it back into little endian. + ;; %%CTR_CHECK keeps track of when we need to increment in order so + ;; that the carry is handled correctly. 
+ + vmovq %%CTR_CHECK, XWORD(%%CTR_BLOCK_SAVE) + +%%_encrypt_by_8_new: + and WORD(%%CTR_CHECK), 255 + add WORD(%%CTR_CHECK), 8 + + vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)] + vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)] + + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z,\ + %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \ + out_order, %%ENC_DEC, full, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \ + %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \ + %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \ + %%MASKREG, no_reduction, %%GL, %%GH, %%GM + + add %%HASHK_PTR, (8 * 16) + add %%DATA_OFFSET, 128 + sub %%LENGTH, 128 + jz %%_encrypt_done + + cmp WORD(%%CTR_CHECK), (256 - 8) + jae %%_encrypt_by_8 + + vpaddd %%CTR_BLOCKz, %%ADD8BE + vpaddd %%CTR_BLOCK2z, %%ADD8BE + + cmp %%LENGTH, 128 + jl %%_encrypt_by_8_partial + + jmp %%_encrypt_by_8_new + +%%_encrypt_by_8: + vpshufb %%CTR_BLOCKz, %%SHUF_MASK + vpshufb %%CTR_BLOCK2z, %%SHUF_MASK + vpaddd %%CTR_BLOCKz, %%ADD8LE + vpaddd %%CTR_BLOCK2z, %%ADD8LE + vpshufb %%CTR_BLOCKz, %%SHUF_MASK + vpshufb %%CTR_BLOCK2z, %%SHUF_MASK + + cmp %%LENGTH, 128 + jge %%_encrypt_by_8_new + +%%_encrypt_by_8_partial: + ;; Test to see if we need a by 8 with partial block. At this point + ;; bytes remaining should be either zero or between 113-127. + ;; 'in_order' shuffle needed to align key for partial block xor. + ;; 'out_order' is a little faster because it avoids extra shuffles. 
+ ;; - counter blocks for the next 8 blocks are prepared and in BE format + ;; - we can go ahead with out_order scenario + + vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)] + vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)] + + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z, \ + %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \ + out_order, %%ENC_DEC, partial, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \ + %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \ + %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \ + %%MASKREG, no_reduction, %%GL, %%GH, %%GM + + add %%HASHK_PTR, (8 * 16) + add %%DATA_OFFSET, (128 - 16) + sub %%LENGTH, (128 - 16) + +%ifidn %%INSTANCE_TYPE, multi_call + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%AES_PARTIAL_BLOCK +%endif + +%%_encrypt_done: + ;; Extract the last counter block in LE format + vextracti32x4 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCK2z, 3 + vpshufb XWORD(%%CTR_BLOCK_SAVE), XWORD(%%SHUF_MASK) + + ;; GHASH last cipher text blocks in xmm1-xmm8 + ;; - if block 8th is partial in a multi-call path then skip the block +%ifidn %%INSTANCE_TYPE, multi_call + cmp qword [%%GDATA_CTX + PBlockLen], 0 + jz %%_hash_last_8 + + ;; save the 8th partial block as GHASH_LAST_7 will clobber %%BLK1 + vextracti32x4 XWORD(%%ZTMP7), %%BLK1, 3 + + GHASH_LAST_7 %%GDATA_KEY, %%BLK1, %%BLK0, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \ + %%AAD_HASHx, %%MASKREG, %%IA0, %%GH, %%GL, %%GM + + ;; XOR the partial word into the hash + vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP7) + jmp %%_ghash_done +%%_hash_last_8: +%endif + GHASH_LAST_8 %%GDATA_KEY, %%BLK1, %%BLK0, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%AAD_HASHx, \ + %%GH, %%GL, %%GM +%%_ghash_done: + vmovdqu64 [%%GDATA_CTX + CurCount], XWORD(%%CTR_BLOCK_SAVE) + vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASHx 
+%%_enc_dec_done: + +%endmacro ; GCM_ENC_DEC + +;;; =========================================================================== +;;; =========================================================================== +;;; Encrypt/decrypt the initial 16 blocks +%macro INITIAL_BLOCKS_16 22 +%define %%IN %1 ; [in] input buffer +%define %%OUT %2 ; [in] output buffer +%define %%KP %3 ; [in] pointer to expanded keys +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits), or no_ghash to skip the xor into block 0 +%define %%CTR %6 ; [in/out] ZMM with CTR BE blocks 4x128 bits; out - lane 3 of the last counter group (B12_15) in every lane +%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check +%define %%ADDBE_4x4 %8 ; [in] ZMM 4x128bits with value 4 (big endian) +%define %%ADDBE_1234 %9 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) +%define %%T0 %10 ; [clobbered] temporary ZMM register +%define %%T1 %11 ; [clobbered] temporary ZMM register +%define %%T2 %12 ; [clobbered] temporary ZMM register +%define %%T3 %13 ; [clobbered] temporary ZMM register +%define %%T4 %14 ; [clobbered] temporary ZMM register +%define %%T5 %15 ; [clobbered] temporary ZMM register +%define %%T6 %16 ; [clobbered] temporary ZMM register +%define %%T7 %17 ; [clobbered] temporary ZMM register +%define %%T8 %18 ; [clobbered] temporary ZMM register +%define %%SHUF_MASK %19 ; [in] ZMM with BE/LE shuffle mask +%define %%ENC_DEC %20 ; [in] ENC (encrypt) or DEC (decrypt) selector +%define %%BLK_OFFSET %21 ; [in] stack frame offset to ciphered blocks +%define %%DATA_DISPL %22 ; [in] fixed numerical data displacement/offset + +%define %%B00_03 %%T5 +%define %%B04_07 %%T6 +%define %%B08_11 %%T7 +%define %%B12_15 %%T8 + +%assign stack_offset (%%BLK_OFFSET) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; prepare counter blocks + + cmp BYTE(%%CTR_CHECK), (256 - 16) + jae %%_next_16_overflow + vpaddd %%B00_03, %%CTR, %%ADDBE_1234 + vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4 + vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4 + vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4 + jmp 
%%_next_16_ok +%%_next_16_overflow: + ;; taken when CTR_CHECK >= 256 - 16, i.e. adding 16 could carry out of the low counter byte: + ;; shuffle CTR with SHUF_MASK, add the ddq_add_* constants, then shuffle the results back + vpshufb %%CTR, %%CTR, %%SHUF_MASK + vmovdqa64 %%B12_15, [rel ddq_add_4444] + vpaddd %%B00_03, %%CTR, [rel ddq_add_1234] + vpaddd %%B04_07, %%B00_03, %%B12_15 + vpaddd %%B08_11, %%B04_07, %%B12_15 + vpaddd %%B12_15, %%B08_11, %%B12_15 + vpshufb %%B00_03, %%SHUF_MASK + vpshufb %%B04_07, %%SHUF_MASK + vpshufb %%B08_11, %%SHUF_MASK + vpshufb %%B12_15, %%SHUF_MASK +%%_next_16_ok: + ;; copy 128-bit lane 3 of B12_15 (the last counter block) into all four lanes of CTR + vshufi64x2 %%CTR, %%B12_15, %%B12_15, 1111_1111b + add BYTE(%%CTR_CHECK), 16 + + ;; === load 16 blocks of data + VX512LDR %%T0, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*0)] + VX512LDR %%T1, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*1)] + VX512LDR %%T2, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*2)] + VX512LDR %%T3, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*3)] + + ;; move to AES encryption rounds +%assign i 0 + vbroadcastf64x2 %%T4, [%%KP + (16*i)] + vpxorq %%B00_03, %%B00_03, %%T4 + vpxorq %%B04_07, %%B04_07, %%T4 + vpxorq %%B08_11, %%B08_11, %%T4 + vpxorq %%B12_15, %%B12_15, %%T4 +%assign i (i + 1) + +%rep NROUNDS + vbroadcastf64x2 %%T4, [%%KP + (16*i)] + vaesenc %%B00_03, %%B00_03, %%T4 + vaesenc %%B04_07, %%B04_07, %%T4 + vaesenc %%B08_11, %%B08_11, %%T4 + vaesenc %%B12_15, %%B12_15, %%T4 +%assign i (i + 1) +%endrep + + vbroadcastf64x2 %%T4, [%%KP + (16*i)] + vaesenclast %%B00_03, %%B00_03, %%T4 + vaesenclast %%B04_07, %%B04_07, %%T4 + vaesenclast %%B08_11, %%B08_11, %%T4 + vaesenclast %%B12_15, %%B12_15, %%T4 + + ;; xor against text + vpxorq %%B00_03, %%B00_03, %%T0 + vpxorq %%B04_07, %%B04_07, %%T1 + vpxorq %%B08_11, %%B08_11, %%T2 + vpxorq %%B12_15, %%B12_15, %%T3 + + ;; store + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*0)], %%B00_03 + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*1)], %%B04_07 + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*2)], %%B08_11 + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*3)], %%B12_15 + +%ifidn %%ENC_DEC, DEC + ;; decryption - cipher text needs to go to GHASH phase + vpshufb 
%%B00_03, %%T0, %%SHUF_MASK + vpshufb %%B04_07, %%T1, %%SHUF_MASK + vpshufb %%B08_11, %%T2, %%SHUF_MASK + vpshufb %%B12_15, %%T3, %%SHUF_MASK +%else + ;; encryption + vpshufb %%B00_03, %%B00_03, %%SHUF_MASK + vpshufb %%B04_07, %%B04_07, %%SHUF_MASK + vpshufb %%B08_11, %%B08_11, %%SHUF_MASK + vpshufb %%B12_15, %%B12_15, %%SHUF_MASK +%endif + +%ifnidn %%GHASH, no_ghash + ;; === xor cipher block 0 with GHASH for the next GHASH round + vpxorq %%B00_03, %%B00_03, %%GHASH +%endif + + ;; park the 16 shuffled cipher blocks in the stack frame at BLK_OFFSET + vmovdqa64 [rsp + stack_offset + (0 * 64)], %%B00_03 + vmovdqa64 [rsp + stack_offset + (1 * 64)], %%B04_07 + vmovdqa64 [rsp + stack_offset + (2 * 64)], %%B08_11 + vmovdqa64 [rsp + stack_offset + (3 * 64)], %%B12_15 +%endmacro ;INITIAL_BLOCKS_16 + +;;; =========================================================================== +;;; =========================================================================== +;;; Encrypt the initial N x 16 blocks +;;; - A x 16 blocks are encrypted/decrypted first (pipeline depth) +;;; - B x 16 blocks are encrypted/decrypted and previous A x 16 are ghashed +;;; - A + B = N +%macro INITIAL_BLOCKS_Nx16 39 +%define %%IN %1 ; [in] input buffer +%define %%OUT %2 ; [in] output buffer +%define %%KP %3 ; [in] pointer to expanded keys +%define %%DATA_OFFSET %4 ; [in/out] data offset +%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits) +%define %%CTR %6 ; [in/out] ZMM with CTR: in - LE & 128b; out - BE & 4x128b +%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check +%define %%T0 %8 ; [clobbered] temporary ZMM register +%define %%T1 %9 ; [clobbered] temporary ZMM register +%define %%T2 %10 ; [clobbered] temporary ZMM register +%define %%T3 %11 ; [clobbered] temporary ZMM register +%define %%T4 %12 ; [clobbered] temporary ZMM register +%define %%T5 %13 ; [clobbered] temporary ZMM register +%define %%T6 %14 ; [clobbered] temporary ZMM register +%define %%T7 %15 ; [clobbered] temporary ZMM register +%define %%T8 %16 ; [clobbered] temporary ZMM register +%define %%T9 %17 ; 
[clobbered] temporary ZMM register +%define %%T10 %18 ; [clobbered] temporary ZMM register +%define %%T11 %19 ; [clobbered] temporary ZMM register +%define %%T12 %20 ; [clobbered] temporary ZMM register +%define %%T13 %21 ; [clobbered] temporary ZMM register +%define %%T14 %22 ; [clobbered] temporary ZMM register +%define %%T15 %23 ; [clobbered] temporary ZMM register +%define %%T16 %24 ; [clobbered] temporary ZMM register +%define %%T17 %25 ; [clobbered] temporary ZMM register +%define %%T18 %26 ; [clobbered] temporary ZMM register +%define %%T19 %27 ; [clobbered] temporary ZMM register +%define %%T20 %28 ; [clobbered] temporary ZMM register +%define %%T21 %29 ; [clobbered] temporary ZMM register +%define %%T22 %30 ; [clobbered] temporary ZMM register +%define %%GH %31 ; [out] ZMM ghash sum (high) +%define %%GL %32 ; [out] ZMM ghash sum (low) +%define %%GM %33 ; [out] ZMM ghash sum (middle) +%define %%ADDBE_4x4 %34 ; [in] ZMM 4x128bits with value 4 (big endian) +%define %%ADDBE_1234 %35 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) +%define %%SHUF_MASK %36 ; [in] ZMM with BE/LE shuffle mask +%define %%ENC_DEC %37 ; [in] ENC (encrypt) or DEC (decrypt) selector +%define %%NBLOCKS %38 ; [in] number of blocks: multiple of 16 +%define %%DEPTH_BLK %39 ; [in] pipeline depth, number of blocks (multiple of 16) + +%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign hkey_offset HashKey_ %+ %%NBLOCKS +%assign data_in_out_offset 0 + + ;; set up CTR_CHECK (low byte of the low dword of CTR) + vmovd DWORD(%%CTR_CHECK), XWORD(%%CTR) + and DWORD(%%CTR_CHECK), 255 + + ;; in LE format after init, convert to BE + vshufi64x2 %%CTR, %%CTR, %%CTR, 0 + vpshufb %%CTR, %%CTR, %%SHUF_MASK + + ;; ==== AES lead in + + ;; first 16 blocks - just cipher + INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \ + %%GHASH, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \ + %%T0, %%T1, %%T2, %%T3, %%T4, \ + %%T5, %%T6, %%T7, %%T8, \ + %%SHUF_MASK, %%ENC_DEC, aesout_offset, 
data_in_out_offset + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) + + ;; remaining lead-in groups of 16 blocks - cipher only, invoked with no_ghash +%if (%%DEPTH_BLK > 16) +%rep ((%%DEPTH_BLK - 16) / 16) + INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \ + no_ghash, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \ + %%T0, %%T1, %%T2, %%T3, %%T4, \ + %%T5, %%T6, %%T7, %%T8, \ + %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%endrep +%endif + + ;; ==== GHASH + AES follows + + ;; first 16 blocks stitched + GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \ + %%T0, %%T1, %%T2, %%T3, \ + %%T4, %%T5, %%T6, %%T7, \ + %%T8, %%T9, %%T10, %%T11,\ + %%T12, %%T13, %%T14, %%T15,\ + %%T16, %%T17, %%T18, %%T19, \ + %%T20, %%T21, %%T22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GL, %%GH, %%GM, \ + first_time, %%ENC_DEC, data_in_out_offset, no_ghash_in + +%if ((%%NBLOCKS - %%DEPTH_BLK) > 16) +%rep ((%%NBLOCKS - %%DEPTH_BLK - 16) / 16) +%assign ghashin_offset (ghashin_offset + (16 * 16)) +%assign hkey_offset (hkey_offset + (16 * 16)) +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) + + ;; mid 16 blocks - stitched + GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \ + %%T0, %%T1, %%T2, %%T3, \ + %%T4, %%T5, %%T6, %%T7, \ + %%T8, %%T9, %%T10, %%T11,\ + %%T12, %%T13, %%T14, %%T15,\ + %%T16, %%T17, %%T18, %%T19, \ + %%T20, %%T21, %%T22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GL, %%GH, %%GM, \ + no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in +%endrep +%endif + ;; advance the data offset by the NBLOCKS blocks handled above + add %%DATA_OFFSET, (%%NBLOCKS * 16) + +%endmacro ;INITIAL_BLOCKS_Nx16 + +;;; =========================================================================== +;;; 
=========================================================================== +;;; GHASH the last 16 blocks of cipher text (last part of by 32/64/128 code) +%macro GHASH_LAST_Nx16 23 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%GHASH %2 ; [out] ghash output +%define %%T1 %3 ; [clobbered] temporary ZMM +%define %%T2 %4 ; [clobbered] temporary ZMM +%define %%T3 %5 ; [clobbered] temporary ZMM +%define %%T4 %6 ; [clobbered] temporary ZMM +%define %%T5 %7 ; [clobbered] temporary ZMM +%define %%T6 %8 ; [clobbered] temporary ZMM +%define %%T7 %9 ; [clobbered] temporary ZMM +%define %%T8 %10 ; [clobbered] temporary ZMM +%define %%T9 %11 ; [clobbered] temporary ZMM +%define %%T10 %12 ; [clobbered] temporary ZMM +%define %%T11 %13 ; [clobbered] temporary ZMM +%define %%T12 %14 ; [clobbered] temporary ZMM +%define %%T13 %15 ; [clobbered] temporary ZMM +%define %%T14 %16 ; [clobbered] temporary ZMM +%define %%T15 %17 ; [clobbered] temporary ZMM +%define %%T16 %18 ; [clobbered] temporary ZMM +%define %%GH %19 ; [in/clobbered] ghash sum (high) +%define %%GL %20 ; [in/clobbered] ghash sum (low) +%define %%GM %21 ; [in/clobbered] ghash sum (medium) +%define %%LOOP_BLK %22 ; [in] numerical number of blocks handled by the loop +%define %%DEPTH_BLK %23 ; [in] numerical number, pipeline depth (ghash vs aes) + +%define %%T0H %%T1 +%define %%T0L %%T2 +%define %%T0M1 %%T3 +%define %%T0M2 %%T4 + +%define %%T1H %%T5 +%define %%T1L %%T6 +%define %%T1M1 %%T7 +%define %%T1M2 %%T8 + +%define %%T2H %%T9 +%define %%T2L %%T10 +%define %%T2M1 %%T11 +%define %%T2M2 %%T12 + +%define %%BLK1 %%T13 +%define %%BLK2 %%T14 + +%define %%HK1 %%T15 +%define %%HK2 %%T16 + +%assign hashk HashKey_ %+ %%DEPTH_BLK +%assign cipher_blk (STACK_LOCAL_OFFSET + ((%%LOOP_BLK - %%DEPTH_BLK) * 16)) + + ;; load cipher blocks and ghash keys + vmovdqa64 %%BLK1, [rsp + cipher_blk] + vmovdqa64 %%BLK2, [rsp + cipher_blk + 64] + vmovdqu64 %%HK1, [%%KP + hashk] + vmovdqu64 %%HK2, [%%KP + hashk + 64] + ;; ghash blocks 
0-3 + vpclmulqdq %%T0H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1 + vpclmulqdq %%T0L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0 + vpclmulqdq %%T0M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0 + vpclmulqdq %%T0M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1 + ;; ghash blocks 4-7 + vpclmulqdq %%T1H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1 + vpclmulqdq %%T1L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0 + vpclmulqdq %%T1M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0 + vpclmulqdq %%T1M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1 + ;; fold in the running sums (0x96 = three-way xor; "+" below is xor) + vpternlogq %%T0H, %%T1H, %%GH, 0x96 ; T0H = T0H + T1H + GH + vpternlogq %%T0L, %%T1L, %%GL, 0x96 ; T0L = T0L + T1L + GL + vpternlogq %%T0M1, %%T1M1, %%GM, 0x96 ; T0M1 = T0M1 + T1M1 + GM + vpxorq %%T0M2, %%T0M2, %%T1M2 ; T0M2 = T0M2 + T1M2 + +%rep ((%%DEPTH_BLK - 8) / 8) +%assign hashk (hashk + 128) +%assign cipher_blk (cipher_blk + 128) + + ;; remaining blocks + ;; load next 8 cipher blocks and corresponding ghash keys + vmovdqa64 %%BLK1, [rsp + cipher_blk] + vmovdqa64 %%BLK2, [rsp + cipher_blk + 64] + vmovdqu64 %%HK1, [%%KP + hashk] + vmovdqu64 %%HK2, [%%KP + hashk + 64] + ;; ghash blocks 0-3 + vpclmulqdq %%T1H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1 + vpclmulqdq %%T1L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0 + vpclmulqdq %%T1M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0 + vpclmulqdq %%T1M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1 + ;; ghash blocks 4-7 + vpclmulqdq %%T2H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1 + vpclmulqdq %%T2L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0 + vpclmulqdq %%T2M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0 + vpclmulqdq %%T2M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1 + ;; update sums + vpternlogq %%T0H, %%T1H, %%T2H, 0x96 ; TH = T0H + T1H + T2H + vpternlogq %%T0L, %%T1L, %%T2L, 0x96 ; TL = T0L + T1L + T2L + vpternlogq %%T0M1, %%T1M1, %%T2M1, 0x96 ; TM1 = T0M1 + T1M1 + T2M1 + vpternlogq %%T0M2, %%T1M2, %%T2M2, 0x96 ; TM2 = T0M2 + T1M2 + T2M2 +%endrep + + ;; integrate TM into TH and TL + vpxorq %%T0M1, %%T0M1, %%T0M2 + vpsrldq %%T1M1, %%T0M1, 8 + vpslldq %%T1M2, %%T0M1, 8 + vpxorq %%T0H, %%T0H, %%T1M1 
+ vpxorq %%T0L, %%T0L, %%T1M2 + + ;; add TH and TL 128-bit words horizontally + VHPXORI4x128 %%T0H, %%T2M1 + VHPXORI4x128 %%T0L, %%T2M2 + + ;; reduction + vmovdqa64 %%HK1, [rel POLY2] + VCLMUL_REDUCE %%GHASH, %%HK1, %%T0H, %%T0L, %%T0M1, %%T0M2 +%endmacro ; GHASH_LAST_Nx16 + +;;; =========================================================================== +;;; =========================================================================== +;;; Encrypt & ghash multiples of 16 blocks + +%macro GHASH_ENCRYPT_Nx16_PARALLEL 39 +%define %%IN %1 ; [in] input buffer +%define %%OUT %2 ; [in] output buffer +%define %%GDATA_KEY %3 ; [in] pointer to expanded keys +%define %%DATA_OFFSET %4 ; [in/out] data offset +%define %%CTR_BE %5 ; [in/out] ZMM last counter block +%define %%SHFMSK %6 ; [in] ZMM with byte swap mask for pshufb +%define %%ZT0 %7 ; [clobbered] temporary ZMM register +%define %%ZT1 %8 ; [clobbered] temporary ZMM register +%define %%ZT2 %9 ; [clobbered] temporary ZMM register +%define %%ZT3 %10 ; [clobbered] temporary ZMM register +%define %%ZT4 %11 ; [clobbered] temporary ZMM register +%define %%ZT5 %12 ; [clobbered] temporary ZMM register +%define %%ZT6 %13 ; [clobbered] temporary ZMM register +%define %%ZT7 %14 ; [clobbered] temporary ZMM register +%define %%ZT8 %15 ; [clobbered] temporary ZMM register +%define %%ZT9 %16 ; [clobbered] temporary ZMM register +%define %%ZT10 %17 ; [clobbered] temporary ZMM register +%define %%ZT11 %18 ; [clobbered] temporary ZMM register +%define %%ZT12 %19 ; [clobbered] temporary ZMM register +%define %%ZT13 %20 ; [clobbered] temporary ZMM register +%define %%ZT14 %21 ; [clobbered] temporary ZMM register +%define %%ZT15 %22 ; [clobbered] temporary ZMM register +%define %%ZT16 %23 ; [clobbered] temporary ZMM register +%define %%ZT17 %24 ; [clobbered] temporary ZMM register +%define %%ZT18 %25 ; [clobbered] temporary ZMM register +%define %%ZT19 %26 ; [clobbered] temporary ZMM register +%define %%ZT20 %27 ; [clobbered] temporary ZMM register +%define %%ZT21 %28 ; [clobered] 
temporary ZMM register +%define %%ZT22 %29 ; [clobbered] temporary ZMM register +%define %%GTH %30 ; [in/out] ZMM GHASH sum (high) +%define %%GTL %31 ; [in/out] ZMM GHASH sum (low) +%define %%GTM %32 ; [in/out] ZMM GHASH sum (medium) +%define %%ADDBE_4x4 %33 ; [in] ZMM 4x128bits with value 4 (big endian) +%define %%ADDBE_1234 %34 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) +%define %%GHASH %35 ; [clobbered] ZMM with intermediate GHASH value +%define %%ENC_DEC %36 ; [in] ENC (encrypt) or DEC (decrypt) selector +%define %%NUM_BLOCKS %37 ; [in] number of blocks to process in the loop +%define %%DEPTH_BLK %38 ; [in] pipeline depth in blocks +%define %%CTR_CHECK %39 ; [in/out] counter to check byte overflow + +%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign ghashin_offset (STACK_LOCAL_OFFSET + ((%%NUM_BLOCKS - %%DEPTH_BLK) * 16)) +%assign hkey_offset HashKey_ %+ %%DEPTH_BLK +%assign data_in_out_offset 0 + + ;; mid 16 blocks +%if (%%DEPTH_BLK > 16) +%rep ((%%DEPTH_BLK - 16) / 16) + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign ghashin_offset (ghashin_offset + (16 * 16)) +%assign hkey_offset (hkey_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%endrep +%endif + + ;; 16 blocks with reduction + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + HashKey_16, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, 
%%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + final_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign hkey_offset HashKey_ %+ %%NUM_BLOCKS + + ;; === copy ZT4 into GHASH; GHASH is handed to the next stitched call in place of no_ghash_in + ;; (presumably ZT4 holds the reduced hash and the callee xors it into cipher block 0 - confirm + ;; against GHASH_16_ENCRYPT_16_PARALLEL) + vmovdqa64 %%GHASH, %%ZT4 + + ;; start the pipeline again + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + first_time, %%ENC_DEC, data_in_out_offset, %%GHASH + +%if ((%%NUM_BLOCKS - %%DEPTH_BLK) > 16) +%rep ((%%NUM_BLOCKS - %%DEPTH_BLK - 16 ) / 16) + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%assign ghashin_offset (ghashin_offset + (16 * 16)) +%assign hkey_offset (hkey_offset + (16 * 16)) + + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in +%endrep +%endif + + ;; advance the data offset by the NUM_BLOCKS blocks handled above + add %%DATA_OFFSET, (%%NUM_BLOCKS * 16) + +%endmacro ;GHASH_ENCRYPT_Nx16_PARALLEL +;;; =========================================================================== + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), the tag length in bytes (AUTH_TAG_LEN), +; whether encoding or decoding (ENC_DEC) and single_call / multi_call (INSTANCE_TYPE). +; Output: Authentication Tag written to AUTH_TAG (16, 12 and 8 byte tags have dedicated store paths, +; any other length goes through simd_store_avx_15) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%INSTANCE_TYPE %6 +%define %%PLAIN_CYPH_LEN rax + + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + ;; Start AES as early as possible + vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If the GCM function is called as a single function call rather + ;; than invoking the individual parts (init, update, finalize) we + ;; can remove a write to read dependency on AadHash. + vmovdqu xmm14, [%%GDATA_CTX + AadHash] + + ;; Hash the final partial block (only when PBlockLen != 0). If we did this as a single call then + ;; the partial block was handled in the main GCM_ENC_DEC macro. 
+ mov r12, [%%GDATA_CTX + PBlockLen] + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + +%endif + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + ;; len(A) is a 64-bit field of the GCM length block, so move the whole register; + ;; a 32-bit move of r12d would drop bits 32-63 (AAD of 512 MiB or more) and corrupt the tag + vmovq xmm15, r12 ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8) + vmovq xmm1, %%PLAIN_CYPH_LEN + vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C) + + vpxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 + vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap + + vpxor xmm9, xmm9, xmm14 ; xmm9 = E(K, Y0) xor final hash = tag + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + + cmp r11, 8 + je %%_T_8 + + simd_store_avx_15 r10, xmm9, r11, r12, rax + jmp %%_return_T_done +%%_T_8: + vmovq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + vmovq rax, xmm9 + mov [r10], rax + vpsrldq xmm9, xmm9, 8 + vmovd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done +%%_T_16: + vmovdqu [r10], xmm9 + +%%_return_T_done: + +%ifdef SAFE_DATA + ;; Clear sensitive data from context structure + vpxor xmm0, xmm0 + vmovdqu [%%GDATA_CTX + AadHash], xmm0 + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0 +%endif +%endmacro ; GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_vaes_avx512 / +; aes_gcm_precomp_192_vaes_avx512 / +; aes_gcm_precomp_256_vaes_avx512 +; (struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(precomp,_) +FN_NAME(precomp,_): + endbranch +;; Parameter is passed through register +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + 
jz exit_precomp +%endif + + FUNC_SAVE + + ;; encrypt an all-zero block with the key schedule at arg1 + vpxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey + + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [rel TWOONE] + vpand xmm2, xmm2, [rel POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + + FUNC_RESTORE +exit_precomp: + ;; the SAFE_PARAM check jumps here before FUNC_SAVE has run, so this label sits after FUNC_RESTORE + + ret +%endif ; _nt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_vaes_avx512 / aes_gcm_init_192_vaes_avx512 / aes_gcm_init_256_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +; With SAFE_PARAM defined, a failed argument check skips GCM_INIT and returns. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(init,_) +FN_NAME(init,_): + endbranch + ;; FUNC_SAVE runs before the checks, so every exit path goes through FUNC_RESTORE at exit_init + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_init + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_init + + ;; Check IV != NULL + cmp arg3, 0 + jz exit_init + + ;; Check if aad_len == 0 + cmp arg5, 0 + jz skip_aad_check_init + + ;; Check aad != NULL (aad_len != 0) + cmp arg4, 0 + jz exit_init + +skip_aad_check_init: +%endif + GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12, k1, xmm14, xmm2, \ + zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10 + +exit_init: + + FUNC_RESTORE + ret +%endif ; _nt + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_vaes_avx512 / aes_gcm_enc_192_update_vaes_avx512 / +; aes_gcm_enc_256_update_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +; With SAFE_PARAM defined, a failed argument check skips GCM_ENC_DEC and returns. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_update_) +FN_NAME(enc,_update_): + endbranch + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_update_enc + +skip_in_out_check_update_enc: +%endif + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call + +exit_update_enc: + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_vaes_avx512 / aes_gcm_dec_192_update_vaes_avx512 / +; aes_gcm_dec_256_update_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +; With SAFE_PARAM defined, a failed argument check skips GCM_ENC_DEC and returns. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_update_) +FN_NAME(dec,_update_): + endbranch + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_dec + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz 
exit_update_dec + +skip_in_out_check_update_dec: +%endif + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call + +exit_update_dec: + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_vaes_avx512 / aes_gcm_enc_192_finalize_vaes_avx512 / +; aes_gcm_enc_256_finalize_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +; With SAFE_PARAM defined, a failed argument check returns without writing auth_tag. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(enc,_finalize_) +FN_NAME(enc,_finalize_): + endbranch + +;; All parameters are passed through registers +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz exit_enc_fin + + ;; Reject auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_enc_fin + + cmp arg4, 16 + ja exit_enc_fin +%endif + + ;; the checks above run before FUNC_SAVE, so exit_enc_fin sits after FUNC_RESTORE + FUNC_SAVE + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call + + FUNC_RESTORE + +exit_enc_fin: + ret +%endif ; _nt + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_vaes_avx512 / aes_gcm_dec_192_finalize_vaes_avx512 / +; aes_gcm_dec_256_finalize_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +; With SAFE_PARAM defined, a failed argument check returns without writing auth_tag. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifnidn FUNCT_EXTENSION, _nt +global FN_NAME(dec,_finalize_) +FN_NAME(dec,_finalize_): + endbranch + +;; All parameters are passed through registers +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz 
exit_dec_fin + + ;; Check auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_dec_fin + + cmp arg4, 16 + ja exit_dec_fin +%endif + + FUNC_SAVE + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call + + FUNC_RESTORE + +exit_dec_fin: + ret +%endif ; _nt + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_vaes_avx512 / aes_gcm_enc_192_vaes_avx512 / aes_gcm_enc_256_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(enc,_) +FN_NAME(enc,_): + endbranch + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc + + ;; Check IV != NULL + cmp arg6, 0 + jz exit_enc + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz exit_enc + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_enc + + cmp arg10, 16 + ja exit_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_enc + +skip_in_out_check_enc: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_enc + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_enc + +skip_aad_check_enc: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \ + zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10 + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call + +exit_enc: + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void 
aes_gcm_dec_128_vaes_avx512 / aes_gcm_dec_192_vaes_avx512 / aes_gcm_dec_256_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +global FN_NAME(dec,_) +FN_NAME(dec,_): + endbranch + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec + + ;; Check IV != NULL + cmp arg6, 0 + jz exit_dec + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz exit_dec + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_dec + + cmp arg10, 16 + ja exit_dec + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_dec + +skip_in_out_check_dec: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_dec + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_dec + +skip_aad_check_dec: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \ + zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10 + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call + +exit_dec: + FUNC_RESTORE + ret + +%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows. 
+%ifidn __OUTPUT_FORMAT__, win64 +global no_ %+ FN_NAME(avx512,_) +no_ %+ FN_NAME(avx512,_) %+ : +%endif +%endif ; (AS_FEATURE_LEVEL) >= 10 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h new file mode 100644 index 000000000..8287198ae --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h @@ -0,0 +1,476 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/

#ifndef AES_GCM_VECTORS_H_
#define AES_GCM_VECTORS_H_

#include <stdint.h>

// AES key sizes. Despite the BITS_ prefix, each enumerator's value is the
// key size in BYTES (16 bytes = 128 bits, 32 bytes = 256 bits).
typedef enum gcm_key_size { BITS_128 = 16, BITS_256 = 32 } gcm_key_size;
// Size of the key array K in bytes (sizeof), despite the "BITS" in the name.
#define KBITS(K)    (sizeof(K))

// struct to hold pointers to the key, plaintext and ciphertext vectors
typedef struct gcm_vector {
	uint8_t*	K;	// AES Key
	gcm_key_size	Klen;	// length of key in bytes (BITS_128 = 16, BITS_256 = 32)
	uint8_t*	IV;	// initial value used by GCM
	uint64_t	IVlen;	// length of IV in bytes
	uint8_t*	A;	// additional authenticated data
	uint64_t	Alen;	// length of AAD in bytes
	uint8_t*	P;	// Plain text
	uint64_t	Plen;	// length of our plaintext in bytes
	//outputs of encryption
	uint8_t*	C;	// same length as PT
	uint8_t*	T;	// Authentication tag
	uint8_t		Tlen;	// AT length in bytes as filled by vector() via sizeof(T); tag can be 0 to 128 bits
} gcm_vector;

///////
// 60-Byte Packet Encryption Using GCM-AES-128
//  http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
//  K:       AD7A2BD03EAC835A6F620FDCB506B345
//  IV:      12153524C0895E81B2C28465
//  AAD:     D609B1F056637A0D46DF998D88E52E00
//           B2C2846512153524C0895E81
//  P:       08000F101112131415161718191A1B1C
//           1D1E1F202122232425262728292A2B2C
//           2D2E2F303132333435363738393A0002
//  C:       701AFA1CC039C0D765128A665DAB6924
//           3899BF7318CCDC81C9931DA17FBE8EDD
//           7D17CB8B4C26FC81E3284F2B7FBA713D
//  AT:      4F8D55E7D3F06FD5A13C0C29B9D5B880
//  H:       73A23D80121DE2D5A850253FCF43120E
///////
static uint8_t K1[] = {0xAD, 0x7A, 0x2B, 0xD0, 0x3E, 0xAC, 0x83, 0x5A, 0x6F, 0x62, 0x0F, 0xDC, 0xB5, 0x06, 0xB3, 0x45};
static uint8_t P1[] = {
	0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
	, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
	, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x00, 0x02
};
static uint8_t IV1[] = {0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81, 0xB2, 0xC2, 0x84, 0x65};
// Remaining arrays of vector 1 (AAD, expected ciphertext, expected tag).
static uint8_t A1[] = {
	0xD6, 0x09, 0xB1, 0xF0, 0x56, 0x63, 0x7A, 0x0D, 0x46, 0xDF, 0x99, 0x8D, 0x88, 0xE5, 0x2E, 0x00
	, 0xB2, 0xC2, 0x84, 0x65, 0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81
};
#define A1_len sizeof(A1)
static uint8_t C1[] = {
	0x70, 0x1A, 0xFA, 0x1C, 0xC0, 0x39, 0xC0, 0xD7, 0x65, 0x12, 0x8A, 0x66, 0x5D, 0xAB, 0x69, 0x24
	, 0x38, 0x99, 0xBF, 0x73, 0x18, 0xCC, 0xDC, 0x81, 0xC9, 0x93, 0x1D, 0xA1, 0x7F, 0xBE, 0x8E, 0xDD
	, 0x7D, 0x17, 0xCB, 0x8B, 0x4C, 0x26, 0xFC, 0x81, 0xE3, 0x28, 0x4F, 0x2B, 0x7F, 0xBA, 0x71, 0x3D
};
static uint8_t T1[] = {
	0x4F, 0x8D, 0x55, 0xE7, 0xD3, 0xF0, 0x6F, 0xD5, 0xA1, 0x3C, 0x0C, 0x29, 0xB9, 0xD5, 0xB8, 0x80
};


///////
// 54-Byte Packet Encryption Using GCM-AES-128
//  http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
//  K:       071B113B0CA743FECCCF3D051F737382
//  IV:      F0761E8DCD3D000176D457ED
//  AAD:     E20106D7CD0DF0761E8DCD3D88E54C2A
//           76D457ED
//  P:       08000F101112131415161718191A1B1C
//           1D1E1F202122232425262728292A2B2C
//           2D2E2F30313233340004
//  C:       13B4C72B389DC5018E72A171DD85A5D3
//           752274D3A019FBCAED09A425CD9B2E1C
//           9B72EEE7C9DE7D52B3F3
//  AT:      D6A5284F4A6D3FE22A5D6C2B960494C3
//  H:       E4E01725D724C1215C7309AD34539257
///////
static uint8_t K2[] = {0x07, 0x1B, 0x11, 0x3B, 0x0C, 0xA7, 0x43, 0xFE, 0xCC, 0xCF, 0x3D, 0x05, 0x1F, 0x73, 0x73, 0x82};
static uint8_t P2[] = {
	0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
	, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
	, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x04
};
static uint8_t IV2[] = {0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x00, 0x01, 0x76, 0xD4, 0x57, 0xED};
//static uint8_t IV1p[] = {0, 0, 0, 1};
static uint8_t A2[] = {
	0xE2, 0x01, 0x06, 0xD7, 0xCD, 0x0D, 0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x88, 0xE5, 0x4C, 0x2A
	, 0x76, 0xD4, 0x57, 0xED
};
#define A2_len sizeof(A2)
static uint8_t C2[] = {
	0x13, 0xB4, 0xC7, 0x2B, 0x38, 0x9D, 0xC5, 0x01, 0x8E, 0x72, 0xA1, 0x71, 0xDD, 0x85, 0xA5, 0xD3
	, 0x75, 0x22, 0x74, 0xD3, 0xA0, 0x19, 0xFB, 0xCA, 0xED, 0x09, 0xA4, 0x25, 0xCD, 0x9B, 0x2E, 0x1C
	, 0x9B, 0x72, 0xEE, 0xE7, 0xC9, 0xDE, 0x7D, 0x52, 0xB3, 0xF3
};
static uint8_t T2[] = {
	0xD6, 0xA5, 0x28, 0x4F, 0x4A, 0x6D, 0x3F, 0xE2, 0x2A, 0x5D, 0x6C, 0x2B, 0x96, 0x04, 0x94, 0xC3
};


///////
// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
// [Keylen = 128]
// [IVlen = 96]
// [PTlen = 128]
// [AADlen = 128]
// [Taglen = 128]
// Count = 0
// K:   c939cc13397c1d37de6ae0e1cb7c423c
// IV:  b3d8cc017cbb89b39e0f67e2
// P:   c3b3c41f113a31b73d9a5cd432103069
// AAD: 24825602bd12a984e0092d3e448eda5f
// C:   93fe7d9e9bfd10348a5606e5cafa7354
// AT:  0032a1dc85f1c9786925a2e71d8272dd
///////
static uint8_t K3[] = {0xc9, 0x39, 0xcc, 0x13, 0x39, 0x7c, 0x1d, 0x37, 0xde, 0x6a, 0xe0, 0xe1, 0xcb, 0x7c, 0x42, 0x3c};
static uint8_t IV3[] = {0xb3, 0xd8, 0xcc, 0x01, 0x7c, 0xbb, 0x89, 0xb3, 0x9e, 0x0f, 0x67, 0xe2};
static uint8_t P3[] = {0xc3, 0xb3, 0xc4, 0x1f, 0x11, 0x3a, 0x31, 0xb7, 0x3d, 0x9a, 0x5c, 0xd4, 0x32, 0x10, 0x30, 0x69};
static uint8_t A3[] = {0x24, 0x82, 0x56, 0x02, 0xbd, 0x12, 0xa9, 0x84, 0xe0, 0x09, 0x2d, 0x3e, 0x44, 0x8e, 0xda, 0x5f};
#define A3_len sizeof(A3)
static uint8_t C3[] = {0x93, 0xfe, 0x7d, 0x9e, 0x9b, 0xfd, 0x10, 0x34, 0x8a, 0x56, 0x06, 0xe5, 0xca, 0xfa, 0x73, 0x54};
static uint8_t T3[] = {0x00, 0x32, 0xa1, 0xdc, 0x85, 0xf1, 0xc9, 0x78, 0x69, 0x25, 0xa2, 0xe7, 0x1d, 0x82, 0x72, 0xdd};

///////
// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
// [Keylen = 128]
// [IVlen = 96]
// [PTlen = 256]
// [AADlen = 128]
// [Taglen = 128]
// Count = 0
// K   = 298efa1ccf29cf62ae6824bfc19557fc
// IV  = 6f58a93fe1d207fae4ed2f6d
// P   = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
// AAD = 021fafd238463973ffe80256e5b1c6b1
// C   = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
// T   = 542465ef599316f73a7a560509a2d9f2
///////
static uint8_t K4[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
static uint8_t IV4[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
static uint8_t P4[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
static uint8_t A4[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
#define A4_len sizeof(A4)
static uint8_t C4[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
static uint8_t T4[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};

///////
// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
// NOTE: every array of this vector is byte-for-byte identical to vector 4.
// [Keylen = 128]
// [IVlen = 96]
// [PTlen = 256]
// [AADlen = 128]
// [Taglen = 128]
// Count = 0
// K   = 298efa1ccf29cf62ae6824bfc19557fc
// IV  = 6f58a93fe1d207fae4ed2f6d
// P   = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
// AAD = 021fafd238463973ffe80256e5b1c6b1
// C   = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
// T   = 542465ef599316f73a7a560509a2d9f2
///////
static uint8_t K5[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
static uint8_t IV5[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
static uint8_t P5[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
static uint8_t A5[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
#define A5_len sizeof(A5)
static uint8_t C5[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
static uint8_t T5[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};


///////
// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
// Test Case 2
// K:       00000000000000000000000000000000
// P:       00000000000000000000000000000000
// IV:      000000000000000000000000
// C:       0388dace60b6a392f328c2b971b2fe78
// T:       ab6e47d42cec13bdf53a67b21257bddf
// H:       66e94bd4ef8a2c3b884cfa59ca342b2e
///////
static uint8_t K6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static uint8_t P6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static uint8_t IV6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
// No AAD in this test case: A6 is a one-byte placeholder and A6_len is 0.
static uint8_t A6[] = {0};
#define A6_len 0
static uint8_t C6[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78};
static uint8_t T6[] = {0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd, 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf};


///////
// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
// Test Case 3
// K:       feffe9928665731c6d6a8f9467308308
// P:       d9313225f88406e5a55909c5aff5269a
//          86a7a9531534f7da2e4c303d8a318a72
//          1c3c0c95956809532fcf0e2449a6b525
//          b16aedf5aa0de657ba637b391aafd255
// IV:      cafebabefacedbaddecaf888
// H:       b83b533708bf535d0aa6e52980d53b78
// C:       42831ec2217774244b7221b784d0d49c
//          e3aa212f2c02a4e035c17e2329aca12e
//          21d514b25466931c7d8f6a5aac84aa05
//          1ba30b396a0aac973d58e091473f5985
// T:       4d5c2af327cd64a62cf35abd2ba6fab4
///////
static uint8_t K7[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
static uint8_t P7[] = {0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
	, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
	, 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
	, 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55};
static uint8_t IV7[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
// No AAD in this test case: A7 is a one-byte placeholder and A7_len is 0.
static uint8_t A7[] = {0};
#define A7_len 0
static uint8_t C7[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
	, 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
	, 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
	, 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85};
static uint8_t T7[] = {0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6, 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4};

///////
// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
// Test Case 4
// K:       feffe9928665731c6d6a8f9467308308
// P:       d9313225f88406e5a55909c5aff5269a
//          86a7a9531534f7da2e4c303d8a318a72
//          1c3c0c95956809532fcf0e2449a6b525
//          b16aedf5aa0de657ba637b39
// A:       feedfacedeadbeeffeedfacedeadbeef
//          abaddad2
// IV:      cafebabefacedbaddecaf888
// H:       b83b533708bf535d0aa6e52980d53b78
// C:       42831ec2217774244b7221b784d0d49c
//          e3aa212f2c02a4e035c17e2329aca12e
//          21d514b25466931c7d8f6a5aac84aa05
//          1ba30b396a0aac973d58e091
// T:       5bc94fbc3221a5db94fae95ae7121a47
///////
static uint8_t K8[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
static uint8_t P8[] = {
	0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
	, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
	, 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
	, 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39};
static uint8_t A8[] = {0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef
	, 0xab, 0xad, 0xda, 0xd2};
#define A8_len sizeof(A8)
static uint8_t IV8[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
// Expected ciphertext is exactly 60 bytes, the same length as P8. The four
// trailing bytes of Test Case 3's ciphertext (0x47, 0x3f, 0x59, 0x85) that
// used to follow here are not part of Test Case 4 and have been dropped.
static uint8_t C8[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
	, 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
	, 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
	, 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91};
static uint8_t T8[] = {0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47};

///////
// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
// Test Case 14
// K:       00000000000000000000000000000000
//          00000000000000000000000000000000
// P:       00000000000000000000000000000000
// A:
// IV:      000000000000000000000000
// H:       dc95c078a2408989ad48a21492842087
// C:       cea7403d4d606b6e074ec5d3baf39d18
// T:       d0d1c8a799996bf0265b98b5d48ab919
///////
static uint8_t K9[] = {
	0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
	0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
static uint8_t P9[] = {
	0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
};
// No AAD in this test case: A9 is a one-byte placeholder.
static uint8_t A9[] = {0};
+#define A9_len 0 +static uint8_t IV9[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; +static uint8_t C9[] = { + 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18 +}; +static uint8_t T9[] = {0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0, 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 15 +// K: feffe9928665731c6d6a8f9467308308 +// feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b391aafd255 +// A: +// IV: cafebabefacedbaddecaf888 +// H: acbef20579b4b8ebce889bac8732dad7 +// C: 522dc1f099567d07f47f37a32a84427d +// 643a8cdcbfe5c0c97598a2bd2555d1aa +// 8cb08e48590dbb3da7b08b1056828838 +// c5f61e6393ba7a0abcc9f662898015ad +// T: b094dac5d93471bdec1a502270e3cc6c +/////// +static uint8_t K10[] = { + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08}; +static uint8_t P10[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 +}; +static uint8_t A10[] = {0}; +#define A10_len 0 +static uint8_t IV10[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88}; +static uint8_t C10[] = { + 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, + 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, + 
0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, + 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad +}; +static uint8_t T10[] = { + 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd, 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 16 +// K: feffe9928665731c6d6a8f9467308308 +// feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b39 +// A: feedfacedeadbeeffeedfacedeadbeef +// abaddad2 +// IV: cafebabefacedbaddecaf888 +// H: acbef20579b4b8ebce889bac8732dad7 +// C: 522dc1f099567d07f47f37a32a84427d +// 643a8cdcbfe5c0c97598a2bd2555d1aa +// 8cb08e48590dbb3da7b08b1056828838 +// c5f61e6393ba7a0abcc9f662 +// T: 76fc6ece0f4e1768cddf8853bb2d551b +/////// +static uint8_t K11[] = { + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, + 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08}; +static uint8_t P11[] = { + 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, + 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, + 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, + 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39 +}; +static uint8_t A11[] = { + 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, + 0xab, 0xad, 0xda, 0xd2}; +#define A11_len sizeof(A11) +static uint8_t IV11[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88}; +static uint8_t C11[] = { + 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 
0x42, 0x7d, + 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, + 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, + 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62 +}; +static uint8_t T11[] = {0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68, 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 17 -- Not supported IV length less than 12 bytes +// K: feffe9928665731c6d6a8f9467308308 +// feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b39 +// A: feedfacedeadbeeffeedfacedeadbeef +// abaddad2 +// IV: cafebabefacedbad +// H: acbef20579b4b8ebce889bac8732dad7 +// C: c3762df1ca787d32ae47c13bf19844cb +// af1ae14d0b976afac52ff7d79bba9de0 +// feb582d33934a4f0954cc2363bc73f78 +// 62ac430e64abe499f47c9b1f +// T: 3a337dbf46a792c45e454913fe2ea8f2 +/////// +//static uint8_t K12[] = { +// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, +// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08}; +//static uint8_t P12[] = { +// 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, +// 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, +// 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, +// 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39 +//}; +//static uint8_t A12[] = { +// 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, +// 0xab, 0xad, 0xda, 0xd2}; +//static uint8_t IV12[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad}; +//static uint8_t 
H12[] = { +// 0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda, 0xd7}; +//static uint8_t C12[] = { +// 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32, 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb, +// 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa, 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0, +// 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0, 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78, +// 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99, 0xf4, 0x7c, 0x9b, 0x1f +//}; +//static uint8_t T12[] = { +// 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4, 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2}; + +/////// +// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf +// Test Case 18 -- Not supported IV length greater than 12 bytes +// K: feffe9928665731c6d6a8f9467308308 +// feffe9928665731c6d6a8f9467308308 +// P: d9313225f88406e5a55909c5aff5269a +// 86a7a9531534f7da2e4c303d8a318a72 +// 1c3c0c95956809532fcf0e2449a6b525 +// b16aedf5aa0de657ba637b39 +// A: feedfacedeadbeeffeedfacedeadbeef +// abaddad2 +// IV: 9313225df88406e555909c5aff5269aa +// 6a7a9538534f7da1e4c303d2a318a728 +// c3c0c95156809539fcf0e2429a6b5254 +// 16aedbf5a0de6a57a637b39b +// H: acbef20579b4b8ebce889bac8732dad7 +// C: 5a8def2f0c9e53f1f75d7853659e2a20 +// eeb2b22aafde6419a058ab4f6f746bf4 +// 0fc0c3b780f244452da3ebf1c5d82cde +// a2418997200ef82e44ae7e3f +// T: a44a8266ee1c8eb0c8b5d4cf5ae9f19a +/////// + + +#define vector(N) {K##N, (KBITS(K##N)), IV##N, sizeof(IV##N), A##N, A##N##_len, P##N, sizeof(P##N), C##N, T##N, sizeof(T##N)} + +gcm_vector const gcm_vectors[] = { + //field order {K, Klen, IV, IVlen, A, Alen, P, Plen, C, T, Tlen}; + // original vector does not have a valid sub hash key + vector(1), + vector(2), + vector(3), + vector(4), + vector(5), + vector(6), + vector(7), + vector(8), + vector(9), + vector(10), + vector(11), + /* vector(12), -- IV of less than 16bytes are not supported */ +}; + +#endif /* 
AES_GCM_VECTORS_H_ */ diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm new file mode 100644 index 000000000..ddae6a4e7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm @@ -0,0 +1,328 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Routine to do AES key expansion + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +%macro key_expansion_128_sse 0 + ;; Assumes the xmm3 includes all zeros at this point. + pshufd xmm2, xmm2, 11111111b + shufps xmm3, xmm1, 00010000b + pxor xmm1, xmm3 + shufps xmm3, xmm1, 10001100b + pxor xmm1, xmm3 + pxor xmm1, xmm2 +%endmacro + +%macro key_expansion_128_avx 0 + ;; Assumes the xmm3 includes all zeros at this point. + vpshufd xmm2, xmm2, 11111111b + vshufps xmm3, xmm3, xmm1, 00010000b + vpxor xmm1, xmm1, xmm3 + vshufps xmm3, xmm3, xmm1, 10001100b + vpxor xmm1, xmm1, xmm3 + vpxor xmm1, xmm1, xmm2 +%endmacro + +%ifidn __OUTPUT_FORMAT__, elf64 +%define KEY rdi +%define EXP_ENC_KEYS rsi +%define EXP_DEC_KEYS rdx +%else +%define KEY rcx +%define EXP_ENC_KEYS rdx +%define EXP_DEC_KEYS r8 +%endif + + +; void aes_keyexp_128(UINT8 *key, +; UINT8 *enc_exp_keys, +; UINT8 *dec_exp_keys); +; +; arg 1: rcx: pointer to key +; arg 2: rdx: pointer to expanded key array for encrypt +; arg 3: r8: pointer to expanded key array for decrypt +; +mk_global aes_keyexp_128_sse, function +aes_keyexp_128_sse: + endbranch + movdqu xmm1, [KEY] ; loading the AES key + movdqu [EXP_ENC_KEYS + 16*0], xmm1 + movdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory + pxor xmm3, xmm3 + + aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*1], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*9], xmm4 + + aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*2], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*8], xmm5 + + aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*3], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*7], xmm4 + + aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 + key_expansion_128_sse + movdqu 
[EXP_ENC_KEYS + 16*4], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*6], xmm5 + + aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*5], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*5], xmm4 + + aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*6], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*4], xmm5 + + aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*7], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*3], xmm4 + + aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*8], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*2], xmm5 + + aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*9], xmm1 + aesimc xmm4, xmm1 + movdqu [EXP_DEC_KEYS + 16*1], xmm4 + + aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*10], xmm1 + movdqu [EXP_DEC_KEYS + 16*0], xmm1 + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +mk_global aes_keyexp_128_avx, function +aes_keyexp_128_avx: + endbranch + vmovdqu xmm1, [KEY] ; loading the AES key + vmovdqu [EXP_ENC_KEYS + 16*0], xmm1 + vmovdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory + vpxor xmm3, xmm3, xmm3 + + vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*1], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*9], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*2], 
xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*8], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*3], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*7], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*4], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*6], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*5], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*5], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*6], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*4], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*7], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*3], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*8], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*2], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*9], xmm1 + vaesimc xmm4, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*1], xmm4 + + vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*10], xmm1 + vmovdqu [EXP_DEC_KEYS + 16*0], xmm1 + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; void aes_keyexp_128_enc_sse(UINT8 *key, +; UINT8 *enc_exp_keys); +; +; arg 1: rcx: pointer to key +; arg 2: rdx: pointer to expanded key array for encrypt +; +mk_global 
aes_keyexp_128_enc_sse, function +aes_keyexp_128_enc_sse: + endbranch + movdqu xmm1, [KEY] ; loading the AES key + movdqu [EXP_ENC_KEYS + 16*0], xmm1 + pxor xmm3, xmm3 + + aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*1], xmm1 + + aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*2], xmm1 + + aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*3], xmm1 + + aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*4], xmm1 + + aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*5], xmm1 + + aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*6], xmm1 + + aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*7], xmm1 + + aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*8], xmm1 + + aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*9], xmm1 + + aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 + key_expansion_128_sse + movdqu [EXP_ENC_KEYS + 16*10], xmm1 + + ret + +mk_global aes_keyexp_128_enc_avx, function +aes_keyexp_128_enc_avx: + endbranch + vmovdqu xmm1, [KEY] ; loading the AES key + vmovdqu [EXP_ENC_KEYS + 16*0], xmm1 + vpxor xmm3, xmm3, xmm3 + + vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*1], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*2], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*3], xmm1 + + vaeskeygenassist xmm2, xmm1, 
0x8 ; Generating round key 4 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*4], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*5], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*6], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*7], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*8], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*9], xmm1 + + vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 + key_expansion_128_avx + vmovdqu [EXP_ENC_KEYS + 16*10], xmm1 + + ret + diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm new file mode 100644 index 000000000..7cde5fb67 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm @@ -0,0 +1,274 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +%ifidn __OUTPUT_FORMAT__, elf64 +%define KEY rdi +%define EXP_ENC_KEYS rsi +%define EXP_DEC_KEYS rdx +%else +%define KEY rcx +%define EXP_ENC_KEYS rdx +%define EXP_DEC_KEYS r8 +%endif + + + + +%macro key_expansion_1_192_sse 1 + ;; Assumes the xmm3 includes all zeros at this point. + pshufd xmm2, xmm2, 11111111b + shufps xmm3, xmm1, 00010000b + pxor xmm1, xmm3 + shufps xmm3, xmm1, 10001100b + pxor xmm1, xmm3 + pxor xmm1, xmm2 + movdqu [EXP_ENC_KEYS+%1], xmm1 +%endmacro + +; Calculate w10 and w11 using calculated w9 and known w4-w5 +%macro key_expansion_2_192_sse 1 + movdqu xmm5, xmm4 + pslldq xmm5, 4 + shufps xmm6, xmm1, 11110000b + pxor xmm6, xmm5 + pxor xmm4, xmm6 + pshufd xmm7, xmm4, 00001110b + movdqu [EXP_ENC_KEYS+%1], xmm7 +%endmacro + +%macro key_dec_192_sse 1 + movdqu xmm0, [EXP_ENC_KEYS + 16 * %1] + aesimc xmm1, xmm0 + movdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1 +%endmacro + + + + + +%macro key_expansion_1_192_avx 1 + ;; Assumes the xmm3 includes all zeros at this point. 
+ vpshufd xmm2, xmm2, 11111111b + vshufps xmm3, xmm3, xmm1, 00010000b + vpxor xmm1, xmm1, xmm3 + vshufps xmm3, xmm3, xmm1, 10001100b + vpxor xmm1, xmm1, xmm3 + vpxor xmm1, xmm1, xmm2 + vmovdqu [EXP_ENC_KEYS+%1], xmm1 +%endmacro + +; Calculate w10 and w11 using calculated w9 and known w4-w5 +%macro key_expansion_2_192_avx 1 + vmovdqa xmm5, xmm4 + vpslldq xmm5, xmm5, 4 + vshufps xmm6, xmm6, xmm1, 11110000b + vpxor xmm6, xmm6, xmm5 + vpxor xmm4, xmm4, xmm6 + vpshufd xmm7, xmm4, 00001110b + vmovdqu [EXP_ENC_KEYS+%1], xmm7 +%endmacro + +%macro key_dec_192_avx 1 + vmovdqu xmm0, [EXP_ENC_KEYS + 16 * %1] + vaesimc xmm1, xmm0 + vmovdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1 +%endmacro + + + + +; void aes_keyexp_192(UINT8 *key, +; UINT8 *enc_exp_keys, +; UINT8 *dec_exp_keys); +; +; arg 1: rcx (rdi when built as elf64): pointer to key +; arg 2: rdx (rsi when built as elf64): pointer to expanded key array for encrypt +; arg 3: r8 (rdx when built as elf64): pointer to expanded key array for decrypt +; +mk_global aes_keyexp_192_sse, function +aes_keyexp_192_sse: + endbranch + +%ifnidn __OUTPUT_FORMAT__, elf64 + sub rsp, 16*2 + 8 ; non-elf64 builds: make room to save xmm6/xmm7, which are overwritten below + movdqu [rsp + 0*16], xmm6 + movdqu [rsp + 1*16], xmm7 +%endif + + movq xmm7, [KEY + 16] ; loading the AES key, upper 64 bits + movq [EXP_ENC_KEYS + 16], xmm7 ; storing those 64 key bits at offset 16 of the encrypt key array + pshufd xmm4, xmm7, 01001111b + movdqu xmm1, [KEY] ; loading the AES key, lower 128 bits + movdqu [EXP_ENC_KEYS], xmm1 ; storing them as entry 0 of the encrypt key array + movdqu [EXP_DEC_KEYS + 16*0], xmm1 ; provisional: entry 0 is rewritten from encrypt key 12 after expansion + movdqu [EXP_DEC_KEYS + 16*12], xmm1 ; decrypt entry 12 is the unmodified first 128 key bits + + pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion. + pxor xmm6, xmm6 ; Set xmm6 to be all zeros. Required for the key_expansion.
+ + aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2 + key_expansion_1_192_sse 24 + key_expansion_2_192_sse 40 + + aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4 + key_expansion_1_192_sse 48 + key_expansion_2_192_sse 64 + + aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5 + key_expansion_1_192_sse 72 + key_expansion_2_192_sse 88 + + aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7 + key_expansion_1_192_sse 96 + key_expansion_2_192_sse 112 + + aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8 + key_expansion_1_192_sse 120 + key_expansion_2_192_sse 136 + + aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10 + key_expansion_1_192_sse 144 + key_expansion_2_192_sse 160 + + aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11 + key_expansion_1_192_sse 168 + key_expansion_2_192_sse 184 + + aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12 + key_expansion_1_192_sse 192 + +;;; we have already saved the 12 th key, which is pure input on the +;;; ENC key path + movdqu xmm0, [EXP_ENC_KEYS + 16 * 12] + movdqu [EXP_DEC_KEYS + 16*0], xmm0 +;;; generate remaining decrypt keys + key_dec_192_sse 1 + key_dec_192_sse 2 + key_dec_192_sse 3 + key_dec_192_sse 4 + key_dec_192_sse 5 + key_dec_192_sse 6 + key_dec_192_sse 7 + key_dec_192_sse 8 + key_dec_192_sse 9 + key_dec_192_sse 10 + key_dec_192_sse 11 + +%ifnidn __OUTPUT_FORMAT__, elf64 + movdqu xmm6, [rsp + 0*16] + movdqu xmm7, [rsp + 1*16] + add rsp, 16*2 + 8 +%endif + + ret + + + +mk_global aes_keyexp_192_avx, function +aes_keyexp_192_avx: + endbranch + +%ifnidn __OUTPUT_FORMAT__, elf64 + sub rsp, 16*2 + 8 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm7 +%endif + + vmovq xmm7, [KEY + 16] ; loading the AES key, 64 bits + vmovq [EXP_ENC_KEYS + 16], xmm7 ; Storing key in memory where all key expansion + vpshufd 
xmm4, xmm7, 01001111b + vmovdqu xmm1, [KEY] ; loading the AES key, 128 bits + vmovdqu [EXP_ENC_KEYS], xmm1 ; Storing key in memory where all key expansion + vmovdqu [EXP_DEC_KEYS + 16*0], xmm1 + vmovdqu [EXP_DEC_KEYS + 16*12], xmm1 + + vpxor xmm3, xmm3, xmm3 + vpxor xmm6, xmm6, xmm6 + + vaeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2 + key_expansion_1_192_avx 24 + key_expansion_2_192_avx 40 + + vaeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4 + key_expansion_1_192_avx 48 + key_expansion_2_192_avx 64 + + vaeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5 + key_expansion_1_192_avx 72 + key_expansion_2_192_avx 88 + + vaeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7 + key_expansion_1_192_avx 96 + key_expansion_2_192_avx 112 + + vaeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8 + key_expansion_1_192_avx 120 + key_expansion_2_192_avx 136 + + vaeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10 + key_expansion_1_192_avx 144 + key_expansion_2_192_avx 160 + + vaeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11 + key_expansion_1_192_avx 168 + key_expansion_2_192_avx 184 + + vaeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12 + key_expansion_1_192_avx 192 + +;;; we have already saved the 12 th key, which is pure input on the +;;; ENC key path + vmovdqu xmm0, [EXP_ENC_KEYS + 16 * 12] + vmovdqu [EXP_DEC_KEYS + 16*0], xmm0 +;;; generate remaining decrypt keys + key_dec_192_avx 1 + key_dec_192_avx 2 + key_dec_192_avx 3 + key_dec_192_avx 4 + key_dec_192_avx 5 + key_dec_192_avx 6 + key_dec_192_avx 7 + key_dec_192_avx 8 + key_dec_192_avx 9 + key_dec_192_avx 10 + key_dec_192_avx 11 + +%ifnidn __OUTPUT_FORMAT__, elf64 + vmovdqu xmm6, [rsp + 0*16] + vmovdqu xmm7, [rsp + 1*16] + add rsp, 16*2 + 8 +%endif + + ret diff --git 
a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm new file mode 100644 index 000000000..9b3eb7688 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm @@ -0,0 +1,286 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +; Routine to do AES key expansion + +%include "reg_sizes.asm" + +[bits 64] +default rel +section .text + +; Uses the f() function of the aeskeygenassist result +%macro key_expansion_256_sse 0 + ;; Assumes the xmm3 includes all zeros at this point. + pshufd xmm2, xmm2, 11111111b + shufps xmm3, xmm1, 00010000b + pxor xmm1, xmm3 + shufps xmm3, xmm1, 10001100b + pxor xmm1, xmm3 + pxor xmm1, xmm2 +%endmacro + +; Uses the SubWord function of the aeskeygenassist result +%macro key_expansion_256_sse_2 0 + ;; Assumes the xmm3 includes all zeros at this point. + pshufd xmm2, xmm2, 10101010b + shufps xmm3, xmm4, 00010000b + pxor xmm4, xmm3 + shufps xmm3, xmm4, 10001100b + pxor xmm4, xmm3 + pxor xmm4, xmm2 +%endmacro + +; Uses the f() function of the aeskeygenassist result +%macro key_expansion_256_avx 0 + ;; Assumes the xmm3 includes all zeros at this point. + vpshufd xmm2, xmm2, 11111111b + vshufps xmm3, xmm3, xmm1, 00010000b + vpxor xmm1, xmm1, xmm3 + vshufps xmm3, xmm3, xmm1, 10001100b + vpxor xmm1, xmm1, xmm3 + vpxor xmm1, xmm1, xmm2 +%endmacro + +; Uses the SubWord function of the aeskeygenassist result +%macro key_expansion_256_avx_2 0 + ;; Assumes the xmm3 includes all zeros at this point. 
+ vpshufd xmm2, xmm2, 10101010b + vshufps xmm3, xmm3, xmm4, 00010000b + vpxor xmm4, xmm4, xmm3 + vshufps xmm3, xmm3, xmm4, 10001100b + vpxor xmm4, xmm4, xmm3 + vpxor xmm4, xmm4, xmm2 +%endmacro + +%ifidn __OUTPUT_FORMAT__, elf64 +%define KEY rdi +%define EXP_ENC_KEYS rsi +%define EXP_DEC_KEYS rdx +%else +%define KEY rcx +%define EXP_ENC_KEYS rdx +%define EXP_DEC_KEYS r8 +%endif + +; void aes_keyexp_256(UINT8 *key, +; UINT8 *enc_exp_keys, +; UINT8 *dec_exp_keys); +; +; arg 1: rcx (rdi when built as elf64): pointer to key +; arg 2: rdx (rsi when built as elf64): pointer to expanded key array for encrypt +; arg 3: r8 (rdx when built as elf64): pointer to expanded key array for decrypt +; +mk_global aes_keyexp_256_sse, function +aes_keyexp_256_sse: + endbranch + movdqu xmm1, [KEY] ; loading the first 128 bits of the AES key + movdqu [EXP_ENC_KEYS + 16*0], xmm1 + movdqu [EXP_DEC_KEYS + 16*14], xmm1 ; decrypt entry 14 is the unmodified first key half + + movdqu xmm4, [KEY+16] ; loading the second 128 bits of the AES key + movdqu [EXP_ENC_KEYS + 16*1], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*13], xmm0 ; decrypt entry 13 is aesimc of the second key half + + pxor xmm3, xmm3 ; Zero xmm3, which the key_expansion macros assume on entry.
+ + aeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*2], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*12], xmm5 + + aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*3], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*11], xmm0 + + aeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*4], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*10], xmm5 + + aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*5], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*9], xmm0 + + aeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*6], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*8], xmm5 + + aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*7], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*7], xmm0 + + aeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*8], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*6], xmm5 + + aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*9], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*5], xmm0 + + aeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*10], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*4], xmm5 + + aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*11], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*3], xmm0 + + aeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*12], xmm1 + aesimc xmm5, xmm1 + movdqu [EXP_DEC_KEYS + 16*2], xmm5 + + 
aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13 + key_expansion_256_sse_2 + movdqu [EXP_ENC_KEYS + 16*13], xmm4 + aesimc xmm0, xmm4 + movdqu [EXP_DEC_KEYS + 16*1], xmm0 + + aeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14 + key_expansion_256_sse + movdqu [EXP_ENC_KEYS + 16*14], xmm1 + movdqu [EXP_DEC_KEYS + 16*0], xmm1 + + ret + + +mk_global aes_keyexp_256_avx, function +aes_keyexp_256_avx: + endbranch + vmovdqu xmm1, [KEY] ; loading the AES key + vmovdqu [EXP_ENC_KEYS + 16*0], xmm1 + vmovdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory + + vmovdqu xmm4, [KEY+16] ; loading the AES key + vmovdqu [EXP_ENC_KEYS + 16*1], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory + + vpxor xmm3, xmm3, xmm3 ; Required for the key_expansion. + + vaeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*2], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*12], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*3], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*11], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*4], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*10], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*5], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*9], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*6], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*8], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*7], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*7], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8 + key_expansion_256_avx + 
vmovdqu [EXP_ENC_KEYS + 16*8], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*6], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*9], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*5], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*10], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*4], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*11], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*3], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*12], xmm1 + vaesimc xmm5, xmm1 + vmovdqu [EXP_DEC_KEYS + 16*2], xmm5 + + vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13 + key_expansion_256_avx_2 + vmovdqu [EXP_ENC_KEYS + 16*13], xmm4 + vaesimc xmm0, xmm4 + vmovdqu [EXP_DEC_KEYS + 16*1], xmm0 + + vaeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14 + key_expansion_256_avx + vmovdqu [EXP_ENC_KEYS + 16*14], xmm1 + vmovdqu [EXP_DEC_KEYS + 16*0], xmm1 + + ret diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm new file mode 100644 index 000000000..045649a64 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm @@ -0,0 +1,68 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern aes_keyexp_128_sse +extern aes_keyexp_128_avx +extern aes_keyexp_128_enc_sse +extern aes_keyexp_128_enc_avx + +extern aes_keyexp_192_sse +extern aes_keyexp_192_avx + +extern aes_keyexp_256_sse +extern aes_keyexp_256_avx + +%include "multibinary.asm" + + +;;;; +; instantiate aes_keyexp_128 interfaces +;;;; +mbin_interface aes_keyexp_128 +mbin_dispatch_init aes_keyexp_128, aes_keyexp_128_sse, aes_keyexp_128_avx, aes_keyexp_128_avx + +mbin_interface aes_keyexp_128_enc +mbin_dispatch_init aes_keyexp_128_enc, aes_keyexp_128_enc_sse, aes_keyexp_128_enc_avx, aes_keyexp_128_enc_avx + +mbin_interface aes_keyexp_192 +mbin_dispatch_init aes_keyexp_192, aes_keyexp_192_sse, aes_keyexp_192_avx, aes_keyexp_192_avx + +mbin_interface aes_keyexp_256 +mbin_dispatch_init aes_keyexp_256, aes_keyexp_256_sse, aes_keyexp_256_avx, aes_keyexp_256_avx + +section .text +;;; func core, ver, snum +slversion aes_keyexp_128, 00, 01, 02a1 +slversion aes_keyexp_192, 00, 01, 02a2 +slversion aes_keyexp_256, 00, 01, 02a3 diff --git a/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h new file mode 100644 index 000000000..80c6e1e87 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h @@ -0,0 +1,302 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#ifndef AES_OSSL_HELPER_H_ +#define AES_OSSL_HELPER_H_ + +#ifdef _MSC_VER +# define inline __inline +#endif + +#include <openssl/evp.h> + +static inline + int openssl_aes_128_cbc_dec(uint8_t * key, uint8_t * iv, + int len, uint8_t * cyphertext, uint8_t * plaintext) +{ + int outlen = 0, tmplen = 0; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_DecryptUpdate(ctx, plaintext, &outlen, (uint8_t const *)cyphertext, len)) + printf("\n ERROR!! 
EVP_DecryptUpdate - EVP_aes_128_cbc\n"); + if (!EVP_DecryptFinal_ex(ctx, &plaintext[outlen], &tmplen)) + printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_128_cbc %x, %x, %x\n", len, + outlen, tmplen); + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_128_cbc_enc(uint8_t * key, uint8_t * iv, + int len, uint8_t * plaintext, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_EncryptUpdate + (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len)) + printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n"); + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_192_cbc_dec(uint8_t * key, uint8_t * iv, + int len, uint8_t * cyphertext, uint8_t * plaintext) +{ + int outlen = 0, tmplen = 0; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_192_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_DecryptUpdate + (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len)) + printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_192_cbc\n"); + if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen)) + printf("\n ERROR!! 
EVP_DecryptFinal_ex - EVP_aes_192_cbc \n"); + + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +static inline + int openssl_aes_192_cbc_enc(uint8_t * key, uint8_t * iv, + int len, uint8_t * plaintext, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_EncryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_192_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_EncryptUpdate + (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len)) + printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_192_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_192_cbc\n"); + + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +static inline + int openssl_aes_256_cbc_dec(uint8_t * key, uint8_t * iv, + int len, uint8_t * cyphertext, uint8_t * plaintext) +{ + int outlen = 0, tmplen = 0; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_DecryptUpdate + (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len)) + printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_256_cbc\n"); + if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen)) + printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_256_cbc %x,%x\n", outlen, + tmplen); + + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +static inline + int openssl_aes_256_cbc_enc(uint8_t * key, uint8_t * iv, + int len, uint8_t * plaintext, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv)) + printf("\n ERROR!! 
EVP_EncryptInit_ex - EVP_aes_256_cbc\n"); + if (!EVP_CIPHER_CTX_set_padding(ctx, 0)) + printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n"); + if (!EVP_EncryptUpdate + (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len)) + printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_cbc\n"); + + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +static inline + int openssl_aes_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad, + int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext, + int len, uint8_t * plaintext) +{ + int outlen = 0, tmplen = len, ret; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL)) + printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_gcm\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n"); + if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - key init\n"); + if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len)) + printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n"); + if (!EVP_DecryptUpdate + (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len)) + printf("\n ERROR!! EVP_DecryptUpdate - PT->CT\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) + printf("\n ERROR!! 
EVP_CIPHER_CTX_ctrl - set tag\n"); + + ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen); + if (0 < ret) { + tmplen += outlen; + } else { + //Authentication failed mismatched key, ADD or tag + tmplen = -1; + } + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad, + int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext, + int len, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + //printf("ivl:%x addl:%x tagl:%x ptl:%x\n", iv_len, aad_len, tag_len, len); + if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL)) + printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n"); + if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv)) + printf("\n ERROR!! EVP_EncryptInit_ex - init\n"); + if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len)) + printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n"); + if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len)) + printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n"); + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_256_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad, + int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext, + int len, uint8_t * plaintext) +{ + int outlen = 0, tmplen = len, ret; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL)) + printf("\n ERROR!! 
EVP_DecryptInit_ex - EVP_aes_128_gcm\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n"); + if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv)) + printf("\n ERROR!! EVP_DecryptInit_ex - key init\n"); + if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len)) + printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n"); + if (!EVP_DecryptUpdate + (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len)) + printf("\n ERROR!! EVP_DecryptUpdate - PT->CT\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n"); + ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen); + if (0 < ret) { + tmplen += outlen; + } else { + //Authentication failed mismatched key, ADD or tag + tmplen = -1; + } + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +static inline + int openssl_aes_256_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad, + int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext, + int len, uint8_t * cyphertext) +{ + int outlen, tmplen; + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL)) + printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n"); + if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv)) + printf("\n ERROR!! EVP_EncryptInit_ex - init\n"); + if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len)) + printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n"); + if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len)) + printf("\n ERROR!! 
EVP_EncryptUpdate - EVP_aes_128_cbc\n"); + if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen)) + printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n"); + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag)) + printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n"); + + EVP_CIPHER_CTX_free(ctx); + return tmplen; +} + +#endif /* AES_OSSL_HELPER_H_ */ diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c new file mode 100644 index 000000000..5dc898992 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c @@ -0,0 +1,143 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include "aes_xts.h" +#include "test.h" + +#include <openssl/evp.h> + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p, + int n) +{ + int i; + for (i = 0; i < 16; i++) { + *k1++ = rand(); + *k2++ = rand(); + *k3++ = rand(); + } + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +static inline + int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + unsigned char *ct, unsigned char *dt) +{ + int outlen, tmplen; + if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)) + printf("\n ERROR!! \n"); + if (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, TEST_LEN)) + printf("\n ERROR!! \n"); + if (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen)) + printf("\n ERROR!! 
\n"); + + return 0; +} + +int main(void) +{ + int i; + + unsigned char key1[16], key2[16], tinit[16]; + unsigned char *pt, *ct, *dt, *refdt; + unsigned char keyssl[32]; /* SSL takes both keys together */ + struct perf start, stop; + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + printf("aes_xts_128_dec_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + refdt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) { + printf("malloc of testsize failed\n"); + return -1; + } + + mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + /* Set up key for the SSL engine */ + for (i = 0; i < 16; i++) { + keyssl[i] = key1[i]; + keyssl[i + 16] = key2[i]; + } + + /* Encrypt and compare decrypted output */ + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt); + if (memcmp(dt, refdt, TEST_LEN)) { + printf("ISA-L and OpenSSL results don't match\n"); + return -1; + } + + /* Time ISA-L decryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + perf_stop(&stop); + printf("aes_xts_128_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Time OpenSSL decryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt); + perf_stop(&stop); + printf("aes_xts_128_openssl_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + EVP_CIPHER_CTX_free(ctx); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c new file mode 100644 index 000000000..fdaa8a9bb --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c @@ -0,0 +1,125 @@ 
+/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include "aes_xts.h" +#include "aes_keyexp.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 3000000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 400 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p, + int n) +{ + int i; + for (i = 0; i < 16; i++) { + *k1++ = rand(); + *k2++ = rand(); + *k3++ = rand(); + } + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int i; + + unsigned char key1[16], key2[16], tinit[16]; + unsigned char *pt, *ct, *dt; + uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11]; + uint8_t expkey1_dec[16 * 11], null_key[16 * 11]; + + printf("aes_xts_128_dec_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt) { + printf("malloc of testsize failed\n"); + return -1; + } + + /* Decode perf test */ + + mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + + struct perf start, stop; + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + } + + perf_stop(&stop); + + printf("aes_xts_128_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Expanded keys perf test */ + + aes_keyexp_128(key1, expkey1_enc, expkey1_dec); + aes_keyexp_128(key2, expkey2_enc, null_key); + 
XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, pt); + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, + pt); + } + + perf_stop(&stop); + + printf("aes_xts_128_dec_expanded_key" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c new file mode 100644 index 000000000..69ae2e60e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c @@ -0,0 +1,144 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include "aes_xts.h" +#include "test.h" + +#include <openssl/evp.h> + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts128_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 16; i++) { + *k1++ = rand(); + *k2++ = rand(); + *k3++ = rand(); + } + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +static inline + int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *pt, unsigned char *ct) +{ + int outlen, tmplen; + if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)) + printf("\n ERROR!! \n"); + if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) + printf("\n ERROR!! \n"); + if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen)) + printf("\n ERROR!! 
\n"); + + return 0; +} + +int main(void) +{ + int i; + + unsigned char key1[16], key2[16], tinit[16]; + unsigned char *pt, *ct, *refct; + struct perf start, stop; + unsigned char keyssl[32]; /* SSL takes both keys together */ + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + printf("aes_xts_128_enc_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + refct = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == refct) { + printf("malloc of testsize failed\n"); + return -1; + } + + xts128_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + /* Set up key for the SSL engine */ + for (i = 0; i < 16; i++) { + keyssl[i] = key1[i]; + keyssl[i + 16] = key2[i]; + } + + /* Encrypt and compare output */ + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct); + if (memcmp(ct, refct, TEST_LEN)) { + printf("ISA-L and OpenSSL results don't match\n"); + return -1; + } + + /* Time ISA-L encryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + perf_stop(&stop); + + printf("aes_xts_128_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Time OpenSSL encryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct); + perf_stop(&stop); + + printf("aes_xts_128_openssl_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + EVP_CIPHER_CTX_free(ctx); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c new file mode 100644 index 000000000..166e46652 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c @@ -0,0 +1,123 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel 
Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/

#include <stdio.h>
#include <stdlib.h>		// for rand
#include <string.h>		// for memcmp
#include "aes_xts.h"
#include "aes_keyexp.h"
#include "test.h"

//#define CACHED_TEST
#ifdef CACHED_TEST
// Cached test, loop many times over small dataset
# define TEST_LEN     8*1024
# define TEST_LOOPS   3000000
# define TEST_TYPE_STR "_warm"
#else
// Uncached test.  Pull from large mem base.
# define GT_L3_CACHE  32*1024*1024	/* some number > last level cache */
# define TEST_LEN     (2 * GT_L3_CACHE)
# define TEST_LOOPS   400
# define TEST_TYPE_STR "_cold"
#endif

#define TEST_MEM TEST_LEN

/*
 * Populate the three 16-byte buffers and the n-byte payload with rand()
 * output.  The draw order (k1, k2, k3 for each index, then the payload)
 * is what determines the generated data.
 */
void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
		  int n)
{
	int idx;

	for (idx = 0; idx < 16; idx++) {
		k1[idx] = rand();
		k2[idx] = rand();
		k3[idx] = rand();
	}

	for (idx = 0; idx < n; idx++)
		p[idx] = rand();
}

/*
 * Time XTS_AES_128_enc, then XTS_AES_128_enc_expanded_key, each over
 * TEST_LOOPS passes of a TEST_LEN-byte buffer.  Returns 0, or -1 when the
 * buffers cannot be allocated.
 */
int main(void)
{
	unsigned char key1[16], key2[16], tinit[16];
	unsigned char *plain, *cipher;
	uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11];
	uint8_t expkey1_dec[16 * 11], null_key[16 * 11];
	struct perf start, stop;
	int pass;

	printf("aes_xts_128_enc_perf:\n");

	plain = malloc(TEST_LEN);
	cipher = malloc(TEST_LEN);

	if (!(plain != NULL && cipher != NULL)) {
		printf("malloc of testsize failed\n");
		return -1;
	}

	/* Encode perf test: one untimed pass first */
	mk_rand_data(key1, key2, tinit, plain, TEST_LEN);
	XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, plain, cipher);

	perf_start(&start);
	pass = 0;
	while (pass < TEST_LOOPS) {
		XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, plain, cipher);
		pass++;
	}
	perf_stop(&stop);

	printf("aes_xts_128_enc" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)TEST_LEN * pass);

	/* Expanded keys perf test: expand once, run one untimed pass, then time */
	aes_keyexp_128(key1, expkey1_enc, expkey1_dec);
	aes_keyexp_128(key2, expkey2_enc, null_key);
	XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, plain,
				     cipher);

	perf_start(&start);
	pass = 0;
	while (pass < TEST_LOOPS) {
		XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN,
					     plain, cipher);
		pass++;
	}
	perf_stop(&stop);

	printf("aes_xts_128_enc_expanded_key" TEST_TYPE_STR ": ");
	perf_print(stop, start, (long long)TEST_LEN * pass);

	return 0;
}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c new file mode 100644 index 000000000..27599f0ca --- /dev/null +++
b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c @@ -0,0 +1,116 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include <aes_keyexp.h> +#include "xts_128_vect.h" + +int main(void) +{ + + // Temporary array for the calculated vectors + uint8_t *ct_test; + uint8_t *pt_test; + // Arrays for expanded keys, null_key is a dummy vector (decrypt key not + // needed for the tweak part of the decryption) + uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11]; + uint8_t expkey1_dec[16 * 11], null_key[16 * 11]; + + int i, j; + + // --- Encryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + ct_test = malloc(vlist[i].ptlen); + if (ct_test == NULL) { + printf("Can't allocate ciphertext memory\n"); + return -1; + } + // Pre-expand keys (will only use the encryption ones here) + aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec); + aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key); + + XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW, + vlist[i].ptlen, vlist[i].PTX, ct_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (ct_test[j] != vlist[i].CTX[j]) { + // Vectors 1-10 and 15-19 are for the 128 bit code + printf("\nXTS_AES_128_enc: Vector %d: ", + i < 9 ? i + 1 : i + 6); + printf("failed at byte %d! 
\n", j); + return -1; + } + } + printf("."); + } + + // --- Decryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + pt_test = malloc(vlist[i].ptlen); + if (pt_test == NULL) { + printf("Can't allocate plaintext memory\n"); + return -1; + } + // Pre-expand keys for the decryption + aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec); + aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key); + + // Note, encryption key is re-used for the tweak decryption step + XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW, + vlist[i].ptlen, vlist[i].CTX, pt_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (pt_test[j] != vlist[i].PTX[j]) { + printf("\nXTS_AES_128_enc: Vector %d: ", + i < 9 ? i + 1 : i + 6); + printf(" failed at byte %d! \n", j); + return -1; + } + } + printf("."); + } + printf("Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c new file mode 100644 index 000000000..4753d6778 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c @@ -0,0 +1,247 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/

#include <stdio.h>
#include <stdlib.h>		// for rand
#include <string.h>		// for memcmp
#include <aes_xts.h>
#include <aes_keyexp.h>

// TEST_LEN: size of each working buffer; TEST_SIZE: window at the end of
// the buffer used by the end-of-buffer tests; RANDOMS: number of
// random-length rounds (overridable at build time).
#define TEST_LEN  (1024*1024)
#define TEST_SIZE (4096)
#ifndef RANDOMS
# define RANDOMS  10
#endif

// Fill three 16-byte buffers (k1, k2, k3) and an n-byte buffer p with
// values from rand().
void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
		  int n)
{
	int i;
	for (i = 0; i < 16; i++) {
		*k1++ = rand();
		*k2++ = rand();
		*k3++ = rand();
	}
	for (i = 0; i < n; i++)
		*p++ = rand();

}

// Round-trip (encrypt then decrypt) self test for XTS-AES-128 on random
// data: one full-length pass, RANDOMS random-length passes, then passes
// positioned at the end of the buffers, for both the plain-key and the
// expanded-key entry points.  Prints '.' per passing step and "Pass" at the
// end.  Returns 0 on success, -1 on allocation failure or any mismatch.
int main(void)
{
	int t, n;

	unsigned char key1[16], key2[16], tinit[16];
	unsigned char *pt, *ct, *dt;

	int align, size, min_size;
	// Pointers into the tail of pt/ct/dt so each run ends at the buffer end
	unsigned char *efence_pt;
	unsigned char *efence_ct;
	unsigned char *efence_dt;

	// Snapshots used to detect any modification of the buffers
	unsigned char *origin_pt;
	unsigned char *origin_ct;
	unsigned char *origin_dt;

	unsigned char key1_exp_enc[16 * 11], key1_exp_dec[16 * 11];
	unsigned char key2_exp_tw[16 * 11];
	int i;

	printf("aes_xts_128 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN);
	pt = malloc(TEST_LEN);
	ct = malloc(TEST_LEN);
	dt = malloc(TEST_LEN);

	if (NULL == pt || NULL == ct || NULL == dt) {
		printf("malloc of testsize failed\n");
		return -1;
	}

	// Full-length round trip
	mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
	XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
	XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);

	if (memcmp(pt, dt, TEST_LEN)) {
		printf("fail\n");
		return -1;
	}
	putchar('.');

	// Do tests with random data, keys and message size
	for (t = 0; t < RANDOMS; t++) {
		n = rand() % (TEST_LEN);
		// Lengths below 17 are skipped; the round still counts towards RANDOMS
		if (n < 17)
			continue;

		mk_rand_data(key1, key2, tinit, pt, n);
		XTS_AES_128_enc(key2, key1, tinit, n, pt, ct);
		XTS_AES_128_dec(key2, key1, tinit, n, ct, dt);

		if (memcmp(pt, dt, n)) {
			printf("fail rand %d, size %d\n", t, n);
			return -1;
		}
		putchar('.');
		fflush(0);
	}

	// Run tests at end of buffer for Electric Fence
	align = 1;
	min_size = 16;
	// Data length here is TEST_SIZE - size, i.e. from TEST_SIZE down to min_size
	for (size = 0; size <= TEST_SIZE - min_size; size += align) {

		// Line up TEST_SIZE from end
		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
		efence_dt = dt + TEST_LEN - TEST_SIZE + size;

		mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
		XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
		XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);

		if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
			printf("efence: fail size %d\n", TEST_SIZE - size);
			return -1;
		}
		putchar('.');
		fflush(0);
	}

	origin_pt = malloc(TEST_LEN);
	origin_ct = malloc(TEST_LEN);
	origin_dt = malloc(TEST_LEN);
	if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) {
		printf("malloc of testsize failed\n");
		return -1;
	}
	// For data lengths from 0 to 15 bytes, the functions return without any error
	// codes, without reading or writing any data.
	// (Data length here is TEST_SIZE - size, i.e. from min_size - 1 down to 0;
	// all three buffers are snapshotted and must be unchanged afterwards.)
	for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {

		// Line up TEST_SIZE from end
		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
		efence_dt = dt + TEST_LEN - TEST_SIZE + size;

		mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
		memcpy(efence_ct, efence_pt, TEST_SIZE - size);
		memcpy(efence_dt, efence_pt, TEST_SIZE - size);
		memcpy(origin_pt, efence_pt, TEST_SIZE - size);
		memcpy(origin_ct, efence_ct, TEST_SIZE - size);
		memcpy(origin_dt, efence_dt, TEST_SIZE - size);

		XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
		XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);

		if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
			printf("efence_pt: fail size %d\n", TEST_SIZE - size);
			return -1;
		}
		if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
			printf("efence_ct: fail size %d\n", TEST_SIZE - size);
			return -1;
		}
		if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
			printf("efence_dt: fail size %d\n", TEST_SIZE - size);
			return -1;
		}
		putchar('.');
		fflush(0);
	}

	// The tweak-side key schedule is filled with random bytes rather than
	// expanded from key2; the same bytes are passed to both enc and dec below.
	for (i = 0; i < 16 * 11; i++) {
		key2_exp_tw[i] = rand();
	}

	// Same end-of-buffer round trip as above, using the expanded-key routines
	for (size = 0; size <= TEST_SIZE - min_size; size += align) {

		// Line up TEST_SIZE from end
		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
		efence_dt = dt + TEST_LEN - TEST_SIZE + size;

		mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
		aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);

		XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
					     TEST_SIZE - size, efence_pt, efence_ct);
		XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
					     TEST_SIZE - size, efence_ct, efence_dt);

		if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
			printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size);
			return -1;
		}
		putchar('.');
		fflush(0);
	}

	// For data lengths from 0 to 15 bytes, the functions return without any error
	// codes, without reading or writing any data.
	// (Expanded-key variant of the unchanged-buffer check above.)
	for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {

		// Line up TEST_SIZE from end
		efence_pt = pt + TEST_LEN - TEST_SIZE + size;
		efence_ct = ct + TEST_LEN - TEST_SIZE + size;
		efence_dt = dt + TEST_LEN - TEST_SIZE + size;

		mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
		memcpy(efence_ct, efence_pt, TEST_SIZE - size);
		memcpy(efence_dt, efence_pt, TEST_SIZE - size);
		memcpy(origin_pt, efence_pt, TEST_SIZE - size);
		memcpy(origin_ct, efence_ct, TEST_SIZE - size);
		memcpy(origin_dt, efence_dt, TEST_SIZE - size);

		aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);

		XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
					     TEST_SIZE - size, efence_pt, efence_ct);
		XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
					     TEST_SIZE - size, efence_ct, efence_dt);

		if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
			printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size);
			return -1;
		}
		if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
			printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size);
			return -1;
		}
		if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
			printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size);
			return -1;
		}
		putchar('.');
		fflush(0);
	}

	printf("Pass\n");

	return 0;
}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c new file mode 100644 index 000000000..065b84465 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c @@ -0,0 +1,271 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "aes_xts.h" +#include <stdlib.h> +#include <openssl/evp.h> + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#ifndef RANDOMS +# define RANDOMS 128 +#endif +#define TEST_LOOPS 128 +#define TEST_LEN (1024*1024) +#define LENGTH_SCAN (2*1024) + +/* Generates random data for keys, tweak and plaintext */ +void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p, + int n) +{ + int i; + for (i = 0; i < 16; i++) { + *k1++ = rand(); + *k2++ = rand(); + *k3++ = rand(); + } + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +/* Wrapper for OpenSSL EVP AES-XTS 128 encryption */ +static inline + int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *pt, unsigned char *ct) +{ + int outlen, tmplen; + if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv) + || (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) + || (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))) { + printf("\n Error in openssl encoding of %d bytes\n", len); + return 1; + } + return 0; +} + +/* Wrapper for OpenSSL EVP AES-XTS 128 decryption */ +static inline + int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *ct, unsigned char *dt) +{ + int outlen, tmplen; + if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv) + || (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len)) + || (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))) { + printf("\n Error in openssl decoding of %d bytes\n", len); + return 1; + } + return 0; +} + +int main(int argc, char **argv) +{ + + unsigned char key1[16], key2[16], tinit[16]; + unsigned char *pt, *ct, *dt, *refct, *refdt; + unsigned char keyssl[32]; /* SSL takes both keys together */ + unsigned int rand_len, t; + int i, j, k, ret; + int seed; + + if (argc == 1) + seed = TEST_SEED; + else 
+ seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + /* Allocate space for input and output buffers */ + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + refct = malloc(TEST_LEN); + refdt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) { + printf("malloc of testsize failed\n"); + return -1; + } + + /**************************** LENGTH SCAN TEST *************************/ + printf("aes_xts_128_rand_ossl test, %d sets of various length: ", 2 * 1024); + + mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + /* Set up key for the SSL engine */ + for (k = 0; k < 16; k++) { + keyssl[k] = key1[k]; + keyssl[k + 16] = key2[k]; + } + + for (ret = 0, i = 16; ret == 0 && i < LENGTH_SCAN; i++) { + + /* Encrypt using each method */ + XTS_AES_128_enc(key2, key1, tinit, i, pt, ct); + ret |= openssl_aes_128_xts_enc(ctx, keyssl, tinit, i, pt, refct); + + // Compare + for (ret = 0, j = 0; j < i && ret == 0; j++) { + if (ct[j] != refct[j]) + ret = 1; + } + if (ret) + printf(" XTS_AES_128_enc size=%d failed at byte %d!\n", i, j); + + /* Decrypt using each method */ + XTS_AES_128_dec(key2, key1, tinit, i, ct, dt); + ret |= openssl_aes_128_xts_dec(ctx, keyssl, tinit, i, refct, refdt); + + for (k = 0, j = 0; j < TEST_LEN && ret == 0; j++) { + if (dt[j] != refdt[j]) + ret = 1; + } + if (ret) + printf(" XTS_AES_128_dec size=%d failed at byte %d!\n", i, j); + if (0 == i % (LENGTH_SCAN / 16)) + printf("."); + fflush(0); + } + if (ret) + return -1; + printf("Pass\n"); + + /**************************** FIXED LENGTH TEST *************************/ + printf("aes_xts_128_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN); + + // Loop over the vectors + for (i = 0; i < TEST_LOOPS; i++) { + + mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + /* Set up key for the 
SSL engine */ + for (k = 0; k < 16; k++) { + keyssl[k] = key1[k]; + keyssl[k + 16] = key2[k]; + } + + /* Encrypt using each method */ + XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct); + if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct)) + return -1; + + /* Carry out comparison of the calculated ciphertext with + * the reference + */ + for (j = 0; j < TEST_LEN; j++) { + + if (ct[j] != refct[j]) { + printf("XTS_AES_128_enc failed at byte %d! \n", j); + return -1; + } + } + + /* Decrypt using each method */ + XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt); + if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt)) + return -1; + + for (j = 0; j < TEST_LEN; j++) { + + if (dt[j] != refdt[j]) { + printf("XTS_AES_128_dec failed at byte %d! \n", j); + return -1; + } + } + if (0 == i % (TEST_LOOPS / 16)) + printf("."); + fflush(0); + } + printf("Pass\n"); + + /**************************** RANDOM LENGTH TEST *************************/ + printf("aes_xts_128_rand_ossl test, %d sets of random lengths: ", RANDOMS); + + /* Run tests with random size */ + + for (t = 0; t < RANDOMS; t++) { + + rand_len = rand() % (TEST_LEN); + rand_len = rand_len < 16 ? 16 : rand_len; + mk_rand_data(key1, key2, tinit, pt, rand_len); + + /* Set up key for the SSL engine */ + for (k = 0; k < 16; k++) { + keyssl[k] = key1[k]; + keyssl[k + 16] = key2[k]; + } + + /* Encrypt using each method */ + XTS_AES_128_enc(key2, key1, tinit, rand_len, pt, ct); + if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct)) + return -1; + + /* Carry out comparison of the calculated ciphertext with + * the reference + */ + for (j = 0; j < rand_len; j++) { + + if (ct[j] != refct[j]) { + printf("XTS_AES_128_enc failed at byte %d! 
\n", j); + return -1; + } + } + + /* Decrypt using each method */ + XTS_AES_128_dec(key2, key1, tinit, rand_len, ct, dt); + if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt)) + return -1; + + for (j = 0; j < rand_len; j++) { + + if (dt[j] != refdt[j]) { + printf("XTS_AES_128_dec failed at byte %d! \n", j); + return -1; + } + } + if (0 == t % (RANDOMS / 16)) + printf("."); + fflush(0); + } + + EVP_CIPHER_CTX_free(ctx); + + printf("Pass\n"); + + printf("aes_xts_128_rand_ossl: All tests passed\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c new file mode 100644 index 000000000..5dd57e33c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c @@ -0,0 +1,106 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include "xts_128_vect.h" + +int main(void) +{ + + // Temporary array for the calculated vectors + uint8_t *ct_test; + uint8_t *pt_test; + + int i, j; + + // --- Encryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + ct_test = malloc(vlist[i].ptlen); + if (ct_test == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return -1; + } + + XTS_AES_128_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW, + vlist[i].ptlen, vlist[i].PTX, ct_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (ct_test[j] != vlist[i].CTX[j]) { + // Vectors 1-10 and 15-19 are for the 128 bit code + printf("\nXTS_AES_128_enc: Vector %d: ", + i < 9 ? i + 1 : i + 6); + + printf("failed at byte %d! 
\n", j); + return -1; + } + } + printf("."); + } + + // --- Decryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + pt_test = malloc(vlist[i].ptlen); + if (pt_test == NULL) { + fprintf(stderr, "Can't allocate plaintext memory\n"); + return -1; + } + + XTS_AES_128_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW, + vlist[i].ptlen, vlist[i].CTX, pt_test); + + for (j = 0; j < vlist[i].ptlen; j++) { + + if (pt_test[j] != vlist[i].PTX[j]) { + // Carry out comparison of the calculated ciphertext with + // the reference + printf("\nXTS_AES_128_enc: Vector %d: ", + i < 9 ? i + 1 : i + 6); + + printf(" failed at byte %d! \n", j); + return -1; + } + } + printf("."); + } + printf("Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h new file mode 100644 index 000000000..fce792dc7 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h @@ -0,0 +1,1691 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "aes_xts.h" + +#define NVEC 14 + +// struct to hold pointers to the key, plaintext and ciphertext vectors +// of one test case (presumably the static vN_* arrays defined below - confirm against vlist) +struct xts_vector { + uint64_t ptlen; // length of our plaintext, in bytes (see "Plaintext length (bytes)" in each vector note) + uint8_t *key1; // dimension 16 for 128 bit aes + uint8_t *key2; // dimension 16 for 128 bit aes + uint8_t *TW; // dimension 16 for both 128 and 256 bit; carries the vector's "Data Unit Sequence number" bytes (compare v2_TW, v5_TW) + uint8_t *PTX; // min. dimension 16 + uint8_t *CTX; // same dimension as PTX +}; + +/* Define our test vectors statically here. 
Test vectors are from the standard: + * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented + * Storage Devices" + * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450 + * + * Vector 1 + * Key1 00000000000000000000000000000000 + * Key2 00000000000000000000000000000000 + * Data Unit Sequence number 0 + * PTX 0000000000000000000000000000000000000000000000000000000000000000 /128bit + * TWK 66e94bd4ef8a2c3b884cfa59ca342b2eccd297a8df1559761099f4b39469565c + * CTX 917cf69ebd68b2ec9b9fe9a3eadda692cd43d2f59598ed858c02c2652fbf922e + * Plaintext length (bytes): 32 + */ + +static uint8_t v1_key1[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v1_key2[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v1_TW[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v1_PTX[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v1_CTX[32] = { + 0x91, 0x7c, 0xf6, 0x9e, 0xbd, 0x68, 0xb2, 0xec, + 0x9b, 0x9f, 0xe9, 0xa3, 0xea, 0xdd, 0xa6, 0x92, + 0xcd, 0x43, 0xd2, 0xf5, 0x95, 0x98, 0xed, 0x85, + 0x8c, 0x02, 0xc2, 0x65, 0x2f, 0xbf, 0x92, 0x2e +}; + +/* + * Vector 2 + * Key1 11111111111111111111111111111111 + * Key2 22222222222222222222222222222222 + * Data Unit Sequence number 3333333333 + * PTX 4444444444444444444444444444444444444444444444444444444444444444 + * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d + * CTX c454185e6a16936e39334038acef838bfb186fff7480adc4289382ecd6d394f0 + * Plaintext length (bytes): 32 + */ + +static uint8_t v2_key1[16] = { + 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, + 0x11, 0x11, 0x11, 0x11, 0x11, 
0x11, 0x11, 0x11 +}; + +static uint8_t v2_key2[16] = { + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22 +}; + +static uint8_t v2_TW[16] = { + 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v2_PTX[32] = { + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44 +}; + +static uint8_t v2_CTX[32] = { + 0xc4, 0x54, 0x18, 0x5e, 0x6a, 0x16, 0x93, 0x6e, + 0x39, 0x33, 0x40, 0x38, 0xac, 0xef, 0x83, 0x8b, + 0xfb, 0x18, 0x6f, 0xff, 0x74, 0x80, 0xad, 0xc4, + 0x28, 0x93, 0x82, 0xec, 0xd6, 0xd3, 0x94, 0xf0 +}; + +/* + * Vector 3 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 22222222222222222222222222222222 + * Data Unit Sequence number 3333333333 + * PTX 4444444444444444444444444444444444444444444444444444444444444444 + * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d + * CTX af85336b597afc1a900b2eb21ec949d292df4c047e0b21532186a5971a227a89 + * Plaintext length (bytes): 32 + */ + +static uint8_t v3_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + +static uint8_t v3_key2[16] = { + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22 +}; + +static uint8_t v3_TW[16] = { + 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v3_PTX[32] = { + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, + 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44 +}; + +static uint8_t v3_CTX[32] = { + 0xaf, 0x85, 0x33, 0x6b, 0x59, 0x7a, 0xfc, 0x1a, + 0x90, 0x0b, 0x2e, 0xb2, 0x1e, 0xc9, 0x49, 0xd2, + 0x92, 0xdf, 0x4c, 0x04, 0x7e, 0x0b, 0x21, 0x53, + 
0x21, 0x86, 0xa5, 0x97, 0x1a, 0x22, 0x7a, 0x89 +}; + +/* + * Vector 4 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence number 0 + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c + * CTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412 + * CTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce + * CTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265 + * CTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8 + * CTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434 + * CTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c + * CTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e + * CTX 
94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc + * CTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3 + * CTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344 + * CTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd + * CTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752 + * CTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e + * CTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d + * CTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568 + * Plaintext length (bytes): 512 + */ +static uint8_t v4_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v4_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v4_TW[16] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v4_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 
0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 
0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v4_CTX[512] = { + 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76, + 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2, + 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25, + 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c, + 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f, + 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00, + 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad, + 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12, + 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5, + 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5, + 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc, + 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce, + 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4, + 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84, + 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a, + 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65, + 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89, + 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51, + 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15, + 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8, + 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed, + 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91, + 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e, + 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34, + 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b, + 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5, + 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4, + 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c, + 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd, + 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3, + 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f, + 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e, + 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91, + 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19, + 
0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1, + 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc, + 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed, + 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde, + 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98, + 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3, + 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca, + 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6, + 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc, + 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44, + 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0, + 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95, + 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4, + 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd, + 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13, + 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7, + 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a, + 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52, + 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a, + 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38, + 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e, + 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e, + 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad, + 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8, + 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c, + 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d, + 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f, + 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2, + 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea, + 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68 +}; + +/* + * Vector 5 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence Number 01 + * PTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c + * PTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412 + * PTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce + * PTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265 + * PTX 
5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8 + * PTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434 + * PTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c + * PTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e + * PTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc + * PTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3 + * PTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344 + * PTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd + * PTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752 + * PTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e + * PTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d + * PTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568 + * CTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5 + * CTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb + * CTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f + * CTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501 + * CTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99 + * CTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407 + * CTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3 + * CTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb + * CTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd + * CTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114 + * CTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed + * CTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62 + * CTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284 + * CTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32 + * CTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c + * CTX 
6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd + * Plaintext length (bytes): 512 + */ + +static uint8_t v5_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v5_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v5_TW[16] = { + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v5_PTX[512] = { + 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76, + 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2, + 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25, + 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c, + 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f, + 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00, + 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad, + 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12, + 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5, + 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5, + 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc, + 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce, + 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4, + 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84, + 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a, + 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65, + 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89, + 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51, + 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15, + 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8, + 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed, + 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91, + 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e, + 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34, + 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b, + 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5, + 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4, + 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c, + 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd, + 
0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3, + 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f, + 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e, + 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91, + 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19, + 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1, + 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc, + 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed, + 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde, + 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98, + 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3, + 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca, + 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6, + 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc, + 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44, + 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0, + 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95, + 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4, + 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd, + 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13, + 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7, + 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a, + 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52, + 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a, + 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38, + 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e, + 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e, + 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad, + 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8, + 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c, + 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d, + 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f, + 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2, + 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea, + 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68 +}; + +static uint8_t v5_CTX[512] = { + 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe, + 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f, + 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60, + 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5, + 0x9d, 0x52, 
0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d, + 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce, + 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b, + 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb, + 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49, + 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3, + 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28, + 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f, + 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca, + 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2, + 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa, + 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01, + 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43, + 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d, + 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58, + 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99, + 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3, + 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f, + 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e, + 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07, + 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a, + 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52, + 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c, + 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3, + 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16, + 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b, + 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6, + 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb, + 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5, + 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0, + 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44, + 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd, + 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb, + 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1, + 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf, + 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14, + 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7, + 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d, + 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c, + 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed, + 0x77, 0xcb, 
0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7, + 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97, + 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32, + 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62, + 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75, + 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce, + 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04, + 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84, + 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43, + 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95, + 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4, + 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32, + 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab, + 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2, + 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d, + 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c, + 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4, + 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d, + 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a, + 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd +}; + +/* + * Vector 6 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence Number 02 + * PTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5 + * PTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb + * PTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f + * PTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501 + * PTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99 + * PTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407 + * PTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3 + * PTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb + * PTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd + * PTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114 + * PTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed + * PTX 
77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62 + * PTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284 + * PTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32 + * PTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c + * PTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd + * CTX fa762a3680b76007928ed4a4f49a9456031b704782e65e16cecb54ed7d017b5e + * CTX 18abd67b338e81078f21edb7868d901ebe9c731a7c18b5e6dec1d6a72e078ac9 + * CTX a4262f860beefa14f4e821018272e411a951502b6e79066e84252c3346f3aa62 + * CTX 344351a291d4bedc7a07618bdea2af63145cc7a4b8d4070691ae890cd65733e7 + * CTX 946e9021a1dffc4c59f159425ee6d50ca9b135fa6162cea18a939838dc000fb3 + * CTX 86fad086acce5ac07cb2ece7fd580b00cfa5e98589631dc25e8e2a3daf2ffdec + * CTX 26531659912c9d8f7a15e5865ea8fb5816d6207052bd7128cd743c12c8118791 + * CTX a4736811935eb982a532349e31dd401e0b660a568cb1a4711f552f55ded59f1f + * CTX 15bf7196b3ca12a91e488ef59d64f3a02bf45239499ac6176ae321c4a211ec54 + * CTX 5365971c5d3f4f09d4eb139bfdf2073d33180b21002b65cc9865e76cb24cd92c + * CTX 874c24c18350399a936ab3637079295d76c417776b94efce3a0ef7206b151105 + * CTX 19655c956cbd8b2489405ee2b09a6b6eebe0c53790a12a8998378b33a5b71159 + * CTX 625f4ba49d2a2fdba59fbf0897bc7aabd8d707dc140a80f0f309f835d3da54ab + * CTX 584e501dfa0ee977fec543f74186a802b9a37adb3e8291eca04d66520d229e60 + * CTX 401e7282bef486ae059aa70696e0e305d777140a7a883ecdcb69b9ff938e8a42 + * CTX 31864c69ca2c2043bed007ff3e605e014bcf518138dc3a25c5e236171a2d01d6 + * Plaintext length (bytes): 512 + */ +static uint8_t v6_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v6_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v6_TW[16] = { + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t 
v6_PTX[512] = { + + 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe, + 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f, + 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60, + 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5, + 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d, + 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce, + 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b, + 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb, + 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49, + 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3, + 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28, + 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f, + 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca, + 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2, + 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa, + 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01, + 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43, + 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d, + 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58, + 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99, + 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3, + 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f, + 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e, + 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07, + 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a, + 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52, + 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c, + 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3, + 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16, + 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b, + 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6, + 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb, + 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5, + 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0, + 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44, + 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd, + 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb, + 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1, + 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf, + 0xea, 0xcf, 0x7f, 0x78, 0x79, 
0xe7, 0xb1, 0x14, + 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7, + 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d, + 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c, + 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed, + 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7, + 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97, + 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32, + 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62, + 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75, + 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce, + 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04, + 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84, + 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43, + 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95, + 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4, + 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32, + 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab, + 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2, + 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d, + 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c, + 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4, + 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d, + 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a, + 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd +}; + +static uint8_t v6_CTX[512] = { + + 0xfa, 0x76, 0x2a, 0x36, 0x80, 0xb7, 0x60, 0x07, + 0x92, 0x8e, 0xd4, 0xa4, 0xf4, 0x9a, 0x94, 0x56, + 0x03, 0x1b, 0x70, 0x47, 0x82, 0xe6, 0x5e, 0x16, + 0xce, 0xcb, 0x54, 0xed, 0x7d, 0x01, 0x7b, 0x5e, + 0x18, 0xab, 0xd6, 0x7b, 0x33, 0x8e, 0x81, 0x07, + 0x8f, 0x21, 0xed, 0xb7, 0x86, 0x8d, 0x90, 0x1e, + 0xbe, 0x9c, 0x73, 0x1a, 0x7c, 0x18, 0xb5, 0xe6, + 0xde, 0xc1, 0xd6, 0xa7, 0x2e, 0x07, 0x8a, 0xc9, + 0xa4, 0x26, 0x2f, 0x86, 0x0b, 0xee, 0xfa, 0x14, + 0xf4, 0xe8, 0x21, 0x01, 0x82, 0x72, 0xe4, 0x11, + 0xa9, 0x51, 0x50, 0x2b, 0x6e, 0x79, 0x06, 0x6e, + 0x84, 0x25, 0x2c, 0x33, 0x46, 0xf3, 0xaa, 0x62, + 0x34, 0x43, 0x51, 0xa2, 0x91, 0xd4, 0xbe, 0xdc, + 0x7a, 0x07, 0x61, 0x8b, 0xde, 0xa2, 0xaf, 0x63, + 0x14, 0x5c, 0xc7, 0xa4, 0xb8, 0xd4, 
0x07, 0x06, + 0x91, 0xae, 0x89, 0x0c, 0xd6, 0x57, 0x33, 0xe7, + 0x94, 0x6e, 0x90, 0x21, 0xa1, 0xdf, 0xfc, 0x4c, + 0x59, 0xf1, 0x59, 0x42, 0x5e, 0xe6, 0xd5, 0x0c, + 0xa9, 0xb1, 0x35, 0xfa, 0x61, 0x62, 0xce, 0xa1, + 0x8a, 0x93, 0x98, 0x38, 0xdc, 0x00, 0x0f, 0xb3, + 0x86, 0xfa, 0xd0, 0x86, 0xac, 0xce, 0x5a, 0xc0, + 0x7c, 0xb2, 0xec, 0xe7, 0xfd, 0x58, 0x0b, 0x00, + 0xcf, 0xa5, 0xe9, 0x85, 0x89, 0x63, 0x1d, 0xc2, + 0x5e, 0x8e, 0x2a, 0x3d, 0xaf, 0x2f, 0xfd, 0xec, + 0x26, 0x53, 0x16, 0x59, 0x91, 0x2c, 0x9d, 0x8f, + 0x7a, 0x15, 0xe5, 0x86, 0x5e, 0xa8, 0xfb, 0x58, + 0x16, 0xd6, 0x20, 0x70, 0x52, 0xbd, 0x71, 0x28, + 0xcd, 0x74, 0x3c, 0x12, 0xc8, 0x11, 0x87, 0x91, + 0xa4, 0x73, 0x68, 0x11, 0x93, 0x5e, 0xb9, 0x82, + 0xa5, 0x32, 0x34, 0x9e, 0x31, 0xdd, 0x40, 0x1e, + 0x0b, 0x66, 0x0a, 0x56, 0x8c, 0xb1, 0xa4, 0x71, + 0x1f, 0x55, 0x2f, 0x55, 0xde, 0xd5, 0x9f, 0x1f, + 0x15, 0xbf, 0x71, 0x96, 0xb3, 0xca, 0x12, 0xa9, + 0x1e, 0x48, 0x8e, 0xf5, 0x9d, 0x64, 0xf3, 0xa0, + 0x2b, 0xf4, 0x52, 0x39, 0x49, 0x9a, 0xc6, 0x17, + 0x6a, 0xe3, 0x21, 0xc4, 0xa2, 0x11, 0xec, 0x54, + 0x53, 0x65, 0x97, 0x1c, 0x5d, 0x3f, 0x4f, 0x09, + 0xd4, 0xeb, 0x13, 0x9b, 0xfd, 0xf2, 0x07, 0x3d, + 0x33, 0x18, 0x0b, 0x21, 0x00, 0x2b, 0x65, 0xcc, + 0x98, 0x65, 0xe7, 0x6c, 0xb2, 0x4c, 0xd9, 0x2c, + 0x87, 0x4c, 0x24, 0xc1, 0x83, 0x50, 0x39, 0x9a, + 0x93, 0x6a, 0xb3, 0x63, 0x70, 0x79, 0x29, 0x5d, + 0x76, 0xc4, 0x17, 0x77, 0x6b, 0x94, 0xef, 0xce, + 0x3a, 0x0e, 0xf7, 0x20, 0x6b, 0x15, 0x11, 0x05, + 0x19, 0x65, 0x5c, 0x95, 0x6c, 0xbd, 0x8b, 0x24, + 0x89, 0x40, 0x5e, 0xe2, 0xb0, 0x9a, 0x6b, 0x6e, + 0xeb, 0xe0, 0xc5, 0x37, 0x90, 0xa1, 0x2a, 0x89, + 0x98, 0x37, 0x8b, 0x33, 0xa5, 0xb7, 0x11, 0x59, + 0x62, 0x5f, 0x4b, 0xa4, 0x9d, 0x2a, 0x2f, 0xdb, + 0xa5, 0x9f, 0xbf, 0x08, 0x97, 0xbc, 0x7a, 0xab, + 0xd8, 0xd7, 0x07, 0xdc, 0x14, 0x0a, 0x80, 0xf0, + 0xf3, 0x09, 0xf8, 0x35, 0xd3, 0xda, 0x54, 0xab, + 0x58, 0x4e, 0x50, 0x1d, 0xfa, 0x0e, 0xe9, 0x77, + 0xfe, 0xc5, 0x43, 0xf7, 0x41, 0x86, 0xa8, 0x02, + 0xb9, 0xa3, 0x7a, 0xdb, 0x3e, 0x82, 
0x91, 0xec, + 0xa0, 0x4d, 0x66, 0x52, 0x0d, 0x22, 0x9e, 0x60, + 0x40, 0x1e, 0x72, 0x82, 0xbe, 0xf4, 0x86, 0xae, + 0x05, 0x9a, 0xa7, 0x06, 0x96, 0xe0, 0xe3, 0x05, + 0xd7, 0x77, 0x14, 0x0a, 0x7a, 0x88, 0x3e, 0xcd, + 0xcb, 0x69, 0xb9, 0xff, 0x93, 0x8e, 0x8a, 0x42, + 0x31, 0x86, 0x4c, 0x69, 0xca, 0x2c, 0x20, 0x43, + 0xbe, 0xd0, 0x07, 0xff, 0x3e, 0x60, 0x5e, 0x01, + 0x4b, 0xcf, 0x51, 0x81, 0x38, 0xdc, 0x3a, 0x25, + 0xc5, 0xe2, 0x36, 0x17, 0x1a, 0x2d, 0x01, 0xd6 +}; + +/* + * Vector 7 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence Number fd + * PTX 8e41b78c390b5af9d758bb214a67e9f6bf7727b09ac6124084c37611398fa45d + * PTX aad94868600ed391fb1acd4857a95b466e62ef9f4b377244d1c152e7b30d731a + * PTX ad30c716d214b707aed99eb5b5e580b3e887cf7497465651d4b60e6042051da3 + * PTX 693c3b78c14489543be8b6ad0ba629565bba202313ba7b0d0c94a3252b676f46 + * PTX cc02ce0f8a7d34c0ed229129673c1f61aed579d08a9203a25aac3a77e9db6026 + * PTX 7996db38df637356d9dcd1632e369939f2a29d89345c66e05066f1a3677aef18 + * PTX dea4113faeb629e46721a66d0a7e785d3e29af2594eb67dfa982affe0aac058f + * PTX 6e15864269b135418261fc3afb089472cf68c45dd7f231c6249ba0255e1e0338 + * PTX 33fc4d00a3fe02132d7bc3873614b8aee34273581ea0325c81f0270affa13641 + * PTX d052d36f0757d484014354d02d6883ca15c24d8c3956b1bd027bcf41f151fd80 + * PTX 23c5340e5606f37e90fdb87c86fb4fa634b3718a30bace06a66eaf8f63c4aa3b + * PTX 637826a87fe8cfa44282e92cb1615af3a28e53bc74c7cba1a0977be9065d0c1a + * PTX 5dec6c54ae38d37f37aa35283e048e5530a85c4e7a29d7b92ec0c3169cdf2a80 + * PTX 5c7604bce60049b9fb7b8eaac10f51ae23794ceba68bb58112e293b9b692ca72 + * PTX 1b37c662f8574ed4dba6f88e170881c82cddc1034a0ca7e284bf0962b6b26292 + * PTX d836fa9f73c1ac770eef0f2d3a1eaf61d3e03555fd424eedd67e18a18094f888 + * CTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3 + * CTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22 + * CTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce + * CTX 
82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c + * CTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8 + * CTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b + * CTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82 + * CTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969 + * CTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5 + * CTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b + * CTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec + * CTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d + * CTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef + * CTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280 + * CTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3 + * CTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637 + * Plaintext length (bytes): 512 + */ +static uint8_t v7_key1[16] = { + + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v7_key2[16] = { + + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v7_TW[16] = { + + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v7_PTX[512] = { + + 0x8e, 0x41, 0xb7, 0x8c, 0x39, 0x0b, 0x5a, 0xf9, + 0xd7, 0x58, 0xbb, 0x21, 0x4a, 0x67, 0xe9, 0xf6, + 0xbf, 0x77, 0x27, 0xb0, 0x9a, 0xc6, 0x12, 0x40, + 0x84, 0xc3, 0x76, 0x11, 0x39, 0x8f, 0xa4, 0x5d, + 0xaa, 0xd9, 0x48, 0x68, 0x60, 0x0e, 0xd3, 0x91, + 0xfb, 0x1a, 0xcd, 0x48, 0x57, 0xa9, 0x5b, 0x46, + 0x6e, 0x62, 0xef, 0x9f, 0x4b, 0x37, 0x72, 0x44, + 0xd1, 0xc1, 0x52, 0xe7, 0xb3, 0x0d, 0x73, 0x1a, + 0xad, 0x30, 0xc7, 0x16, 0xd2, 0x14, 0xb7, 0x07, + 0xae, 0xd9, 0x9e, 0xb5, 0xb5, 0xe5, 0x80, 0xb3, + 0xe8, 0x87, 0xcf, 0x74, 0x97, 0x46, 0x56, 0x51, + 0xd4, 0xb6, 0x0e, 
0x60, 0x42, 0x05, 0x1d, 0xa3, + 0x69, 0x3c, 0x3b, 0x78, 0xc1, 0x44, 0x89, 0x54, + 0x3b, 0xe8, 0xb6, 0xad, 0x0b, 0xa6, 0x29, 0x56, + 0x5b, 0xba, 0x20, 0x23, 0x13, 0xba, 0x7b, 0x0d, + 0x0c, 0x94, 0xa3, 0x25, 0x2b, 0x67, 0x6f, 0x46, + 0xcc, 0x02, 0xce, 0x0f, 0x8a, 0x7d, 0x34, 0xc0, + 0xed, 0x22, 0x91, 0x29, 0x67, 0x3c, 0x1f, 0x61, + 0xae, 0xd5, 0x79, 0xd0, 0x8a, 0x92, 0x03, 0xa2, + 0x5a, 0xac, 0x3a, 0x77, 0xe9, 0xdb, 0x60, 0x26, + 0x79, 0x96, 0xdb, 0x38, 0xdf, 0x63, 0x73, 0x56, + 0xd9, 0xdc, 0xd1, 0x63, 0x2e, 0x36, 0x99, 0x39, + 0xf2, 0xa2, 0x9d, 0x89, 0x34, 0x5c, 0x66, 0xe0, + 0x50, 0x66, 0xf1, 0xa3, 0x67, 0x7a, 0xef, 0x18, + 0xde, 0xa4, 0x11, 0x3f, 0xae, 0xb6, 0x29, 0xe4, + 0x67, 0x21, 0xa6, 0x6d, 0x0a, 0x7e, 0x78, 0x5d, + 0x3e, 0x29, 0xaf, 0x25, 0x94, 0xeb, 0x67, 0xdf, + 0xa9, 0x82, 0xaf, 0xfe, 0x0a, 0xac, 0x05, 0x8f, + 0x6e, 0x15, 0x86, 0x42, 0x69, 0xb1, 0x35, 0x41, + 0x82, 0x61, 0xfc, 0x3a, 0xfb, 0x08, 0x94, 0x72, + 0xcf, 0x68, 0xc4, 0x5d, 0xd7, 0xf2, 0x31, 0xc6, + 0x24, 0x9b, 0xa0, 0x25, 0x5e, 0x1e, 0x03, 0x38, + 0x33, 0xfc, 0x4d, 0x00, 0xa3, 0xfe, 0x02, 0x13, + 0x2d, 0x7b, 0xc3, 0x87, 0x36, 0x14, 0xb8, 0xae, + 0xe3, 0x42, 0x73, 0x58, 0x1e, 0xa0, 0x32, 0x5c, + 0x81, 0xf0, 0x27, 0x0a, 0xff, 0xa1, 0x36, 0x41, + 0xd0, 0x52, 0xd3, 0x6f, 0x07, 0x57, 0xd4, 0x84, + 0x01, 0x43, 0x54, 0xd0, 0x2d, 0x68, 0x83, 0xca, + 0x15, 0xc2, 0x4d, 0x8c, 0x39, 0x56, 0xb1, 0xbd, + 0x02, 0x7b, 0xcf, 0x41, 0xf1, 0x51, 0xfd, 0x80, + 0x23, 0xc5, 0x34, 0x0e, 0x56, 0x06, 0xf3, 0x7e, + 0x90, 0xfd, 0xb8, 0x7c, 0x86, 0xfb, 0x4f, 0xa6, + 0x34, 0xb3, 0x71, 0x8a, 0x30, 0xba, 0xce, 0x06, + 0xa6, 0x6e, 0xaf, 0x8f, 0x63, 0xc4, 0xaa, 0x3b, + 0x63, 0x78, 0x26, 0xa8, 0x7f, 0xe8, 0xcf, 0xa4, + 0x42, 0x82, 0xe9, 0x2c, 0xb1, 0x61, 0x5a, 0xf3, + 0xa2, 0x8e, 0x53, 0xbc, 0x74, 0xc7, 0xcb, 0xa1, + 0xa0, 0x97, 0x7b, 0xe9, 0x06, 0x5d, 0x0c, 0x1a, + 0x5d, 0xec, 0x6c, 0x54, 0xae, 0x38, 0xd3, 0x7f, + 0x37, 0xaa, 0x35, 0x28, 0x3e, 0x04, 0x8e, 0x55, + 0x30, 0xa8, 0x5c, 0x4e, 0x7a, 0x29, 0xd7, 0xb9, + 0x2e, 0xc0, 0xc3, 
0x16, 0x9c, 0xdf, 0x2a, 0x80, + 0x5c, 0x76, 0x04, 0xbc, 0xe6, 0x00, 0x49, 0xb9, + 0xfb, 0x7b, 0x8e, 0xaa, 0xc1, 0x0f, 0x51, 0xae, + 0x23, 0x79, 0x4c, 0xeb, 0xa6, 0x8b, 0xb5, 0x81, + 0x12, 0xe2, 0x93, 0xb9, 0xb6, 0x92, 0xca, 0x72, + 0x1b, 0x37, 0xc6, 0x62, 0xf8, 0x57, 0x4e, 0xd4, + 0xdb, 0xa6, 0xf8, 0x8e, 0x17, 0x08, 0x81, 0xc8, + 0x2c, 0xdd, 0xc1, 0x03, 0x4a, 0x0c, 0xa7, 0xe2, + 0x84, 0xbf, 0x09, 0x62, 0xb6, 0xb2, 0x62, 0x92, + 0xd8, 0x36, 0xfa, 0x9f, 0x73, 0xc1, 0xac, 0x77, + 0x0e, 0xef, 0x0f, 0x2d, 0x3a, 0x1e, 0xaf, 0x61, + 0xd3, 0xe0, 0x35, 0x55, 0xfd, 0x42, 0x4e, 0xed, + 0xd6, 0x7e, 0x18, 0xa1, 0x80, 0x94, 0xf8, 0x88 +}; + +static uint8_t v7_CTX[512] = { + + 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e, + 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a, + 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9, + 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3, + 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07, + 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3, + 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08, + 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22, + 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c, + 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f, + 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd, + 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce, + 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f, + 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5, + 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc, + 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c, + 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3, + 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b, + 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec, + 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8, + 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee, + 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b, + 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3, + 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b, + 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd, + 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c, + 0x5a, 0xea, 0x52, 0x5e, 
0xe5, 0x3d, 0xe7, 0xb2, + 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82, + 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d, + 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd, + 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6, + 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69, + 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e, + 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7, + 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53, + 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5, + 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58, + 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32, + 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a, + 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b, + 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17, + 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45, + 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54, + 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec, + 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0, + 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39, + 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66, + 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d, + 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83, + 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31, + 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03, + 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef, + 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18, + 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1, + 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65, + 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80, + 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23, + 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f, + 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49, + 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3, + 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78, + 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1, + 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc, + 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37 +}; + +/* + * Vector 8 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit 
Sequence Number fe + * PTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3 + * PTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22 + * PTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce + * PTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c + * PTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8 + * PTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b + * PTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82 + * PTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969 + * PTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5 + * PTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b + * PTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec + * PTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d + * PTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef + * PTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280 + * PTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3 + * PTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637 + * CTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395 + * CTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae + * CTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94 + * CTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0 + * CTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd + * CTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d + * CTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792 + * CTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e + * CTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18 + * CTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b + * CTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb + * CTX 
18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a + * CTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d + * CTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4 + * CTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1 + * CTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a + * Plaintext length (bytes): 512 + */ +static uint8_t v8_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + +static uint8_t v8_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v8_TW[16] = { + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v8_PTX[512] = { + 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e, + 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a, + 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9, + 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3, + 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07, + 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3, + 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08, + 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22, + 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c, + 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f, + 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd, + 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce, + 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f, + 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5, + 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc, + 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c, + 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3, + 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b, + 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec, + 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8, + 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee, + 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b, + 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3, + 0x6e, 
0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b, + 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd, + 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c, + 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2, + 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82, + 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d, + 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd, + 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6, + 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69, + 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e, + 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7, + 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53, + 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5, + 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58, + 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32, + 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a, + 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b, + 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17, + 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45, + 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54, + 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec, + 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0, + 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39, + 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66, + 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d, + 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83, + 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31, + 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03, + 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef, + 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18, + 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1, + 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65, + 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80, + 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23, + 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f, + 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49, + 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3, + 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78, + 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1, + 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc, + 0x53, 
0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37 +}; + +static uint8_t v8_CTX[512] = { + 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59, + 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda, + 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44, + 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95, + 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1, + 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2, + 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e, + 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae, + 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e, + 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a, + 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27, + 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94, + 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c, + 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32, + 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18, + 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0, + 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d, + 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d, + 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf, + 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd, + 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1, + 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72, + 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb, + 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d, + 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6, + 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b, + 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58, + 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92, + 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf, + 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a, + 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29, + 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e, + 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28, + 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77, + 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87, + 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18, + 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1, + 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd, + 0x26, 0x16, 0x74, 
0xbf, 0x45, 0xa7, 0x3d, 0x67, + 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b, + 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde, + 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75, + 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a, + 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb, + 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32, + 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11, + 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65, + 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a, + 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca, + 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24, + 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e, + 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d, + 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05, + 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06, + 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9, + 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4, + 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11, + 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c, + 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85, + 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1, + 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11, + 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c, + 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef, + 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a +}; + +/* + * Vector 9 + * Key1 27182818284590452353602874713526 + * Key2 31415926535897932384626433832795 + * Data Unit Sequence Number ff + * PTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395 + * PTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae + * PTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94 + * PTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0 + * PTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd + * PTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d + * PTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792 + * PTX 
88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e + * PTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18 + * PTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b + * PTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb + * PTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a + * PTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d + * PTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4 + * PTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1 + * PTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a + * CTX 3260ae8dad1f4a32c5cafe3ab0eb95549d461a67ceb9e5aa2d3afb62dece0553 + * CTX 193ba50c75be251e08d1d08f1088576c7efdfaaf3f459559571e12511753b07a + * CTX f073f35da06af0ce0bbf6b8f5ccc5cea500ec1b211bd51f63b606bf6528796ca + * CTX 12173ba39b8935ee44ccce646f90a45bf9ccc567f0ace13dc2d53ebeedc81f58 + * CTX b2e41179dddf0d5a5c42f5d8506c1a5d2f8f59f3ea873cbcd0eec19acbf32542 + * CTX 3bd3dcb8c2b1bf1d1eaed0eba7f0698e4314fbeb2f1566d1b9253008cbccf45a + * CTX 2b0d9c5c9c21474f4076e02be26050b99dee4fd68a4cf890e496e4fcae7b70f9 + * CTX 4ea5a9062da0daeba1993d2ccd1dd3c244b8428801495a58b216547e7e847c46 + * CTX d1d756377b6242d2e5fb83bf752b54e0df71e889f3a2bb0f4c10805bf3c59037 + * CTX 6e3c24e22ff57f7fa965577375325cea5d920db94b9c336b455f6e894c01866f + * CTX e9fbb8c8d3f70a2957285f6dfb5dcd8cbf54782f8fe7766d4723819913ac7734 + * CTX 21e3a31095866bad22c86a6036b2518b2059b4229d18c8c2ccbdf906c6cc6e82 + * CTX 464ee57bddb0bebcb1dc645325bfb3e665ef7251082c88ebb1cf203bd779fdd3 + * CTX 8675713c8daadd17e1cabee432b09787b6ddf3304e38b731b45df5df51b78fcf + * CTX b3d32466028d0ba36555e7e11ab0ee0666061d1645d962444bc47a38188930a8 + * CTX 4b4d561395c73c087021927ca638b7afc8a8679ccb84c26555440ec7f10445cd + * Plaintext length (bytes): 512 + */ +static uint8_t v9_key1[16] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26 +}; + 
+static uint8_t v9_key2[16] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95 +}; + +static uint8_t v9_TW[16] = { + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v9_PTX[512] = { + 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59, + 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda, + 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44, + 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95, + 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1, + 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2, + 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e, + 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae, + 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e, + 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a, + 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27, + 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94, + 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c, + 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32, + 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18, + 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0, + 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d, + 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d, + 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf, + 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd, + 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1, + 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72, + 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb, + 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d, + 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6, + 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b, + 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58, + 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92, + 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf, + 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a, + 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29, + 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e, + 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28, + 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 
0x77, + 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87, + 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18, + 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1, + 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd, + 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67, + 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b, + 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde, + 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75, + 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a, + 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb, + 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32, + 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11, + 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65, + 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a, + 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca, + 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24, + 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e, + 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d, + 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05, + 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06, + 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9, + 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4, + 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11, + 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c, + 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85, + 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1, + 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11, + 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c, + 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef, + 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a +}; + +static uint8_t v9_CTX[512] = { + 0x32, 0x60, 0xae, 0x8d, 0xad, 0x1f, 0x4a, 0x32, + 0xc5, 0xca, 0xfe, 0x3a, 0xb0, 0xeb, 0x95, 0x54, + 0x9d, 0x46, 0x1a, 0x67, 0xce, 0xb9, 0xe5, 0xaa, + 0x2d, 0x3a, 0xfb, 0x62, 0xde, 0xce, 0x05, 0x53, + 0x19, 0x3b, 0xa5, 0x0c, 0x75, 0xbe, 0x25, 0x1e, + 0x08, 0xd1, 0xd0, 0x8f, 0x10, 0x88, 0x57, 0x6c, + 0x7e, 0xfd, 0xfa, 0xaf, 0x3f, 0x45, 0x95, 0x59, + 0x57, 0x1e, 0x12, 0x51, 0x17, 0x53, 0xb0, 0x7a, + 0xf0, 0x73, 0xf3, 0x5d, 0xa0, 0x6a, 0xf0, 0xce, + 
0x0b, 0xbf, 0x6b, 0x8f, 0x5c, 0xcc, 0x5c, 0xea, + 0x50, 0x0e, 0xc1, 0xb2, 0x11, 0xbd, 0x51, 0xf6, + 0x3b, 0x60, 0x6b, 0xf6, 0x52, 0x87, 0x96, 0xca, + 0x12, 0x17, 0x3b, 0xa3, 0x9b, 0x89, 0x35, 0xee, + 0x44, 0xcc, 0xce, 0x64, 0x6f, 0x90, 0xa4, 0x5b, + 0xf9, 0xcc, 0xc5, 0x67, 0xf0, 0xac, 0xe1, 0x3d, + 0xc2, 0xd5, 0x3e, 0xbe, 0xed, 0xc8, 0x1f, 0x58, + 0xb2, 0xe4, 0x11, 0x79, 0xdd, 0xdf, 0x0d, 0x5a, + 0x5c, 0x42, 0xf5, 0xd8, 0x50, 0x6c, 0x1a, 0x5d, + 0x2f, 0x8f, 0x59, 0xf3, 0xea, 0x87, 0x3c, 0xbc, + 0xd0, 0xee, 0xc1, 0x9a, 0xcb, 0xf3, 0x25, 0x42, + 0x3b, 0xd3, 0xdc, 0xb8, 0xc2, 0xb1, 0xbf, 0x1d, + 0x1e, 0xae, 0xd0, 0xeb, 0xa7, 0xf0, 0x69, 0x8e, + 0x43, 0x14, 0xfb, 0xeb, 0x2f, 0x15, 0x66, 0xd1, + 0xb9, 0x25, 0x30, 0x08, 0xcb, 0xcc, 0xf4, 0x5a, + 0x2b, 0x0d, 0x9c, 0x5c, 0x9c, 0x21, 0x47, 0x4f, + 0x40, 0x76, 0xe0, 0x2b, 0xe2, 0x60, 0x50, 0xb9, + 0x9d, 0xee, 0x4f, 0xd6, 0x8a, 0x4c, 0xf8, 0x90, + 0xe4, 0x96, 0xe4, 0xfc, 0xae, 0x7b, 0x70, 0xf9, + 0x4e, 0xa5, 0xa9, 0x06, 0x2d, 0xa0, 0xda, 0xeb, + 0xa1, 0x99, 0x3d, 0x2c, 0xcd, 0x1d, 0xd3, 0xc2, + 0x44, 0xb8, 0x42, 0x88, 0x01, 0x49, 0x5a, 0x58, + 0xb2, 0x16, 0x54, 0x7e, 0x7e, 0x84, 0x7c, 0x46, + 0xd1, 0xd7, 0x56, 0x37, 0x7b, 0x62, 0x42, 0xd2, + 0xe5, 0xfb, 0x83, 0xbf, 0x75, 0x2b, 0x54, 0xe0, + 0xdf, 0x71, 0xe8, 0x89, 0xf3, 0xa2, 0xbb, 0x0f, + 0x4c, 0x10, 0x80, 0x5b, 0xf3, 0xc5, 0x90, 0x37, + 0x6e, 0x3c, 0x24, 0xe2, 0x2f, 0xf5, 0x7f, 0x7f, + 0xa9, 0x65, 0x57, 0x73, 0x75, 0x32, 0x5c, 0xea, + 0x5d, 0x92, 0x0d, 0xb9, 0x4b, 0x9c, 0x33, 0x6b, + 0x45, 0x5f, 0x6e, 0x89, 0x4c, 0x01, 0x86, 0x6f, + 0xe9, 0xfb, 0xb8, 0xc8, 0xd3, 0xf7, 0x0a, 0x29, + 0x57, 0x28, 0x5f, 0x6d, 0xfb, 0x5d, 0xcd, 0x8c, + 0xbf, 0x54, 0x78, 0x2f, 0x8f, 0xe7, 0x76, 0x6d, + 0x47, 0x23, 0x81, 0x99, 0x13, 0xac, 0x77, 0x34, + 0x21, 0xe3, 0xa3, 0x10, 0x95, 0x86, 0x6b, 0xad, + 0x22, 0xc8, 0x6a, 0x60, 0x36, 0xb2, 0x51, 0x8b, + 0x20, 0x59, 0xb4, 0x22, 0x9d, 0x18, 0xc8, 0xc2, + 0xcc, 0xbd, 0xf9, 0x06, 0xc6, 0xcc, 0x6e, 0x82, + 0x46, 0x4e, 0xe5, 0x7b, 0xdd, 0xb0, 0xbe, 0xbc, + 
0xb1, 0xdc, 0x64, 0x53, 0x25, 0xbf, 0xb3, 0xe6, + 0x65, 0xef, 0x72, 0x51, 0x08, 0x2c, 0x88, 0xeb, + 0xb1, 0xcf, 0x20, 0x3b, 0xd7, 0x79, 0xfd, 0xd3, + 0x86, 0x75, 0x71, 0x3c, 0x8d, 0xaa, 0xdd, 0x17, + 0xe1, 0xca, 0xbe, 0xe4, 0x32, 0xb0, 0x97, 0x87, + 0xb6, 0xdd, 0xf3, 0x30, 0x4e, 0x38, 0xb7, 0x31, + 0xb4, 0x5d, 0xf5, 0xdf, 0x51, 0xb7, 0x8f, 0xcf, + 0xb3, 0xd3, 0x24, 0x66, 0x02, 0x8d, 0x0b, 0xa3, + 0x65, 0x55, 0xe7, 0xe1, 0x1a, 0xb0, 0xee, 0x06, + 0x66, 0x06, 0x1d, 0x16, 0x45, 0xd9, 0x62, 0x44, + 0x4b, 0xc4, 0x7a, 0x38, 0x18, 0x89, 0x30, 0xa8, + 0x4b, 0x4d, 0x56, 0x13, 0x95, 0xc7, 0x3c, 0x08, + 0x70, 0x21, 0x92, 0x7c, 0xa6, 0x38, 0xb7, 0xaf, + 0xc8, 0xa8, 0x67, 0x9c, 0xcb, 0x84, 0xc2, 0x65, + 0x55, 0x44, 0x0e, 0xc7, 0xf1, 0x04, 0x45, 0xcd +}; + +/* + * Vector 15 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 + * Data unit sequence number 9a78563412 + * PTX 000102030405060708090a0b0c0d0e0f10 + * CTX 6c1625db4671522d3d7599601de7ca09ed + * Plaintext length (bytes): 17 + */ + +static uint8_t v15_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + +static uint8_t v15_key2[16] = { + 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, + 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0 +}; + +static uint8_t v15_TW[16] = { + 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v15_PTX[17] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10 +}; + +static uint8_t v15_CTX[17] = { + 0x6c, 0x16, 0x25, 0xdb, 0x46, 0x71, 0x52, 0x2d, + 0x3d, 0x75, 0x99, 0x60, 0x1d, 0xe7, 0xca, 0x09, + 0xed +}; + +/* + * Vector 16 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 + * Data unit sequence number 9a78563412 + * PTX 000102030405060708090a0b0c0d0e0f1011 + * CTX d069444b7a7e0cab09e24447d24deb1fedbf + * Plaintext length (bytes): 18 + */ 
+static uint8_t v16_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + +static uint8_t v16_key2[16] = { + 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, + 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0 +}; + +static uint8_t v16_TW[16] = { + 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v16_PTX[18] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11 +}; + +static uint8_t v16_CTX[18] = { + 0xd0, 0x69, 0x44, 0x4b, 0x7a, 0x7e, 0x0c, 0xab, + 0x09, 0xe2, 0x44, 0x47, 0xd2, 0x4d, 0xeb, 0x1f, + 0xed, 0xbf +}; + +/* + * Vector 17 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 + * Data unit sequence number 9a78563412 + * PTX 000102030405060708090a0b0c0d0e0f101112 + * CTX e5df1351c0544ba1350b3363cd8ef4beedbf9d + * Plaintext length (bytes): 19 + */ + +static uint8_t v17_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + +static uint8_t v17_key2[16] = { + 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, + 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0 +}; + +static uint8_t v17_TW[16] = { + 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v17_PTX[19] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12 +}; + +static uint8_t v17_CTX[19] = { + 0xe5, 0xdf, 0x13, 0x51, 0xc0, 0x54, 0x4b, 0xa1, + 0x35, 0x0b, 0x33, 0x63, 0xcd, 0x8e, 0xf4, 0xbe, + 0xed, 0xbf, 0x9d +}; + +/* + * Vector 18 + * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0 + * Data unit sequence number 9a78563412 + * PTX 000102030405060708090a0b0c0d0e0f10111213 + * CTX 9d84c813f719aa2c7be3f66171c7c5c2edbf9dac + * Plaintext length (bytes): 20 + */ + 
+static uint8_t v18_key1[16] = { + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, + 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0 +}; + +static uint8_t v18_key2[16] = { + 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8, + 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0 +}; + +static uint8_t v18_TW[16] = { + 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v18_PTX[20] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13 +}; + +static uint8_t v18_CTX[20] = { + 0x9d, 0x84, 0xc8, 0x13, 0xf7, 0x19, 0xaa, 0x2c, + 0x7b, 0xe3, 0xf6, 0x61, 0x71, 0xc7, 0xc5, 0xc2, + 0xed, 0xbf, 0x9d, 0xac +}; + +/* + * Vector 19 + * Key1 e0e1e2e3e4e5e6e7e8e9eaebecedeeef + * Key2 c0c1c2c3c4c5c6c7c8c9cacbcccdcecf + * Data unit sequence number 21436587a9 + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX 
e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 38b45812ef43a05bd957e545907e223b954ab4aaf088303ad910eadf14b42be6 + * CTX 8b2461149d8c8ba85f992be970bc621f1b06573f63e867bf5875acafa04e42cc + * CTX bd7bd3c2a0fb1fff791ec5ec36c66ae4ac1e806d81fbf709dbe29e471fad3854 + * CTX 9c8e66f5345d7c1eb94f405d1ec785cc6f6a68f6254dd8339f9d84057e01a177 + * CTX 41990482999516b5611a38f41bb6478e6f173f320805dd71b1932fc333cb9ee3 + * CTX 9936beea9ad96fa10fb4112b901734ddad40bc1878995f8e11aee7d141a2f5d4 + * CTX 8b7a4e1e7f0b2c04830e69a4fd1378411c2f287edf48c6c4e5c247a19680f7fe + * CTX 41cefbd49b582106e3616cbbe4dfb2344b2ae9519391f3e0fb4922254b1d6d2d + * CTX 19c6d4d537b3a26f3bcc51588b32f3eca0829b6a5ac72578fb814fb43cf80d64 + * CTX a233e3f997a3f02683342f2b33d25b492536b93becb2f5e1a8b82f5b88334272 + * CTX 9e8ae09d16938841a21a97fb543eea3bbff59f13c1a18449e398701c1ad51648 + * CTX 346cbc04c27bb2da3b93a1372ccae548fb53bee476f9e9c91773b1bb19828394 + * CTX d55d3e1a20ed69113a860b6829ffa847224604435070221b257e8dff783615d2 + * CTX cae4803a93aa4334ab482a0afac9c0aeda70b45a481df5dec5df8cc0f423c77a + * CTX 5fd46cd312021d4b438862419a791be03bb4d97c0e59578542531ba466a83baf + * CTX 92cefc151b5cc1611a167893819b63fb8a6b18e86de60290fa72b797b0ce59f3 + * Plaintext length (bytes): 512 + */ +static uint8_t v19_key1[16] = { + + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef +}; + +static uint8_t v19_key2[16] = { + + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf +}; + +static uint8_t v19_TW[16] = { + + 0x21, 0x43, 0x65, 0x87, 0xa9, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v19_PTX[512] = { + + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 
0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 
0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v19_CTX[512] = { + 0x38, 0xb4, 0x58, 0x12, 0xef, 0x43, 0xa0, 0x5b, + 0xd9, 0x57, 0xe5, 0x45, 0x90, 0x7e, 0x22, 0x3b, + 0x95, 0x4a, 0xb4, 0xaa, 0xf0, 0x88, 0x30, 0x3a, + 0xd9, 0x10, 0xea, 0xdf, 0x14, 0xb4, 0x2b, 0xe6, + 0x8b, 0x24, 0x61, 0x14, 0x9d, 0x8c, 0x8b, 0xa8, + 0x5f, 0x99, 0x2b, 0xe9, 0x70, 0xbc, 0x62, 0x1f, + 0x1b, 0x06, 0x57, 0x3f, 0x63, 0xe8, 0x67, 0xbf, + 0x58, 0x75, 0xac, 0xaf, 0xa0, 0x4e, 0x42, 0xcc, + 0xbd, 0x7b, 0xd3, 0xc2, 0xa0, 0xfb, 0x1f, 0xff, + 0x79, 0x1e, 0xc5, 0xec, 0x36, 0xc6, 0x6a, 0xe4, + 0xac, 0x1e, 0x80, 0x6d, 0x81, 0xfb, 0xf7, 0x09, + 0xdb, 0xe2, 0x9e, 0x47, 0x1f, 0xad, 0x38, 0x54, + 0x9c, 0x8e, 0x66, 0xf5, 0x34, 0x5d, 0x7c, 0x1e, + 0xb9, 0x4f, 0x40, 0x5d, 0x1e, 0xc7, 0x85, 0xcc, + 0x6f, 0x6a, 0x68, 0xf6, 0x25, 0x4d, 0xd8, 0x33, + 0x9f, 0x9d, 0x84, 0x05, 0x7e, 0x01, 0xa1, 0x77, + 0x41, 0x99, 0x04, 0x82, 0x99, 0x95, 0x16, 0xb5, + 0x61, 0x1a, 0x38, 0xf4, 0x1b, 0xb6, 0x47, 0x8e, + 0x6f, 0x17, 0x3f, 0x32, 0x08, 0x05, 0xdd, 0x71, + 0xb1, 0x93, 0x2f, 0xc3, 0x33, 0xcb, 0x9e, 0xe3, + 0x99, 0x36, 0xbe, 0xea, 0x9a, 
0xd9, 0x6f, 0xa1, + 0x0f, 0xb4, 0x11, 0x2b, 0x90, 0x17, 0x34, 0xdd, + 0xad, 0x40, 0xbc, 0x18, 0x78, 0x99, 0x5f, 0x8e, + 0x11, 0xae, 0xe7, 0xd1, 0x41, 0xa2, 0xf5, 0xd4, + 0x8b, 0x7a, 0x4e, 0x1e, 0x7f, 0x0b, 0x2c, 0x04, + 0x83, 0x0e, 0x69, 0xa4, 0xfd, 0x13, 0x78, 0x41, + 0x1c, 0x2f, 0x28, 0x7e, 0xdf, 0x48, 0xc6, 0xc4, + 0xe5, 0xc2, 0x47, 0xa1, 0x96, 0x80, 0xf7, 0xfe, + 0x41, 0xce, 0xfb, 0xd4, 0x9b, 0x58, 0x21, 0x06, + 0xe3, 0x61, 0x6c, 0xbb, 0xe4, 0xdf, 0xb2, 0x34, + 0x4b, 0x2a, 0xe9, 0x51, 0x93, 0x91, 0xf3, 0xe0, + 0xfb, 0x49, 0x22, 0x25, 0x4b, 0x1d, 0x6d, 0x2d, + 0x19, 0xc6, 0xd4, 0xd5, 0x37, 0xb3, 0xa2, 0x6f, + 0x3b, 0xcc, 0x51, 0x58, 0x8b, 0x32, 0xf3, 0xec, + 0xa0, 0x82, 0x9b, 0x6a, 0x5a, 0xc7, 0x25, 0x78, + 0xfb, 0x81, 0x4f, 0xb4, 0x3c, 0xf8, 0x0d, 0x64, + 0xa2, 0x33, 0xe3, 0xf9, 0x97, 0xa3, 0xf0, 0x26, + 0x83, 0x34, 0x2f, 0x2b, 0x33, 0xd2, 0x5b, 0x49, + 0x25, 0x36, 0xb9, 0x3b, 0xec, 0xb2, 0xf5, 0xe1, + 0xa8, 0xb8, 0x2f, 0x5b, 0x88, 0x33, 0x42, 0x72, + 0x9e, 0x8a, 0xe0, 0x9d, 0x16, 0x93, 0x88, 0x41, + 0xa2, 0x1a, 0x97, 0xfb, 0x54, 0x3e, 0xea, 0x3b, + 0xbf, 0xf5, 0x9f, 0x13, 0xc1, 0xa1, 0x84, 0x49, + 0xe3, 0x98, 0x70, 0x1c, 0x1a, 0xd5, 0x16, 0x48, + 0x34, 0x6c, 0xbc, 0x04, 0xc2, 0x7b, 0xb2, 0xda, + 0x3b, 0x93, 0xa1, 0x37, 0x2c, 0xca, 0xe5, 0x48, + 0xfb, 0x53, 0xbe, 0xe4, 0x76, 0xf9, 0xe9, 0xc9, + 0x17, 0x73, 0xb1, 0xbb, 0x19, 0x82, 0x83, 0x94, + 0xd5, 0x5d, 0x3e, 0x1a, 0x20, 0xed, 0x69, 0x11, + 0x3a, 0x86, 0x0b, 0x68, 0x29, 0xff, 0xa8, 0x47, + 0x22, 0x46, 0x04, 0x43, 0x50, 0x70, 0x22, 0x1b, + 0x25, 0x7e, 0x8d, 0xff, 0x78, 0x36, 0x15, 0xd2, + 0xca, 0xe4, 0x80, 0x3a, 0x93, 0xaa, 0x43, 0x34, + 0xab, 0x48, 0x2a, 0x0a, 0xfa, 0xc9, 0xc0, 0xae, + 0xda, 0x70, 0xb4, 0x5a, 0x48, 0x1d, 0xf5, 0xde, + 0xc5, 0xdf, 0x8c, 0xc0, 0xf4, 0x23, 0xc7, 0x7a, + 0x5f, 0xd4, 0x6c, 0xd3, 0x12, 0x02, 0x1d, 0x4b, + 0x43, 0x88, 0x62, 0x41, 0x9a, 0x79, 0x1b, 0xe0, + 0x3b, 0xb4, 0xd9, 0x7c, 0x0e, 0x59, 0x57, 0x85, + 0x42, 0x53, 0x1b, 0xa4, 0x66, 0xa8, 0x3b, 0xaf, + 0x92, 0xce, 0xfc, 0x15, 0x1b, 
0x5c, 0xc1, 0x61, + 0x1a, 0x16, 0x78, 0x93, 0x81, 0x9b, 0x63, 0xfb, + 0x8a, 0x6b, 0x18, 0xe8, 0x6d, 0xe6, 0x02, 0x90, + 0xfa, 0x72, 0xb7, 0x97, 0xb0, 0xce, 0x59, 0xf3 +}; + +// Define vector of structs, with pointers to the statically defined vectors + +struct xts_vector vlist[NVEC] = { + + // pointers to the statically defined vectors here + + // Vector 1 + {sizeof(v1_CTX), v1_key1, v1_key2, v1_TW, v1_PTX, v1_CTX} + , + // Vector 2 + {sizeof(v2_CTX), v2_key1, v2_key2, v2_TW, v2_PTX, v2_CTX} + , + // Vector 3 + {sizeof(v3_CTX), v3_key1, v3_key2, v3_TW, v3_PTX, v3_CTX} + , + // Vector 4 + {sizeof(v4_CTX), v4_key1, v4_key2, v4_TW, v4_PTX, v4_CTX} + , + // Vector 5 + {sizeof(v5_CTX), v5_key1, v5_key2, v5_TW, v5_PTX, v5_CTX} + , + // Vector 6 + {sizeof(v6_CTX), v6_key1, v6_key2, v6_TW, v6_PTX, v6_CTX} + , + // Vector 7 + {sizeof(v7_CTX), v7_key1, v7_key2, v7_TW, v7_PTX, v7_CTX} + , + // Vector 8 + {sizeof(v8_CTX), v8_key1, v8_key2, v8_TW, v8_PTX, v8_CTX} + , + // Vector 9 + {sizeof(v9_CTX), v9_key1, v9_key2, v9_TW, v9_PTX, v9_CTX} + , + // Vector 15 + {sizeof(v15_CTX), v15_key1, v15_key2, v15_TW, v15_PTX, v15_CTX} + , + // Vector 16 + {sizeof(v16_CTX), v16_key1, v16_key2, v16_TW, v16_PTX, v16_CTX} + , + // Vector 17 + {sizeof(v17_CTX), v17_key1, v17_key2, v17_TW, v17_PTX, v17_CTX} + , + // Vector 18 + {sizeof(v18_CTX), v18_key1, v18_key2, v18_TW, v18_PTX, v18_CTX} + , + // Vector 19 + {sizeof(v19_CTX), v19_key1, v19_key2, v19_TW, v19_PTX, v19_CTX} + +}; diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c new file mode 100644 index 000000000..5bccd4a5c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c @@ -0,0 +1,145 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include "aes_xts.h" +#include "test.h" + +#include <openssl/evp.h> + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +static inline + int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *pt, unsigned char *ct) +{ + int outlen, tmplen; + if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)) + printf("\n ERROR!! \n"); + if (!EVP_DecryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) + printf("\n ERROR!! \n"); + if (!EVP_DecryptFinal_ex(ctx, ct + outlen, &tmplen)) + printf("\n ERROR!! \n"); + + return 0; +} + +int main(void) +{ + int i; + + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct, *dt, *refdt; + struct perf start, stop; + unsigned char keyssl[64]; /* SSL takes both keys together */ + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + printf("aes_xts_256_dec_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + refdt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) { + printf("malloc of testsize failed\n"); + return -1; + } + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + /* Set up key for the SSL engine */ + for (i = 0; i < 32; i++) { + keyssl[i] = key1[i]; + keyssl[i + 32] = key2[i]; + } + + /* Encrypt and compare decrypted output */ + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt); + if 
(memcmp(dt, refdt, TEST_LEN)) { + printf("ISA-L and OpenSSL results don't match\n"); + return -1; + } + + /* Time ISA-L decryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + perf_stop(&stop); + printf("aes_xts_256_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Time OpenSSL decryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt); + perf_stop(&stop); + printf("aes_xts_256_openssl_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + EVP_CIPHER_CTX_free(ctx); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c new file mode 100644 index 000000000..ff3d62e93 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c @@ -0,0 +1,126 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include "aes_xts.h" +#include "aes_keyexp.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 3000000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 400 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int i; + + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct, *dt; + uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15]; + uint8_t expkey1_dec[16 * 15], null_key[16 * 15]; + + printf("aes_xts_256_dec_perf:\n"); + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt) { + printf("malloc of testsize failed\n"); + return -1; + } + + /* Decode perf test */ + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + + struct perf start, stop; + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + } + + perf_stop(&stop); + + printf("aes_xts_256_dec" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Expanded keys perf test */ + + aes_keyexp_256(key1, expkey1_enc, expkey1_dec); + aes_keyexp_256(key2, expkey2_enc, null_key); + XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, pt); + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, + pt); + } + + perf_stop(&stop); + + printf("aes_xts_256_dec_expanded_key" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c 
b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c new file mode 100644 index 000000000..8d477ca89 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c @@ -0,0 +1,145 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include "aes_xts.h" +#include "test.h" + +#include <openssl/evp.h> + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 400000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. +# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 50 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +static inline + int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *pt, unsigned char *ct) +{ + int outlen, tmplen; + if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)) + printf("\n ERROR!! \n"); + if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) + printf("\n ERROR!! \n"); + if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen)) + printf("\n ERROR!! 
\n"); + + return 0; +} + +int main(void) +{ + int i; + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct, *refct; + struct perf start, stop; + unsigned char keyssl[64]; /* SSL takes both keys together */ + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + printf("aes_xts_256_enc_perf:\n"); + + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + refct = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == refct) { + printf("malloc of testsize failed\n"); + return -1; + } + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + /* Set up key for the SSL engine */ + for (i = 0; i < 32; i++) { + keyssl[i] = key1[i]; + keyssl[i + 32] = key2[i]; + } + + /* Encrypt and compare output */ + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct); + if (memcmp(ct, refct, TEST_LEN)) { + printf("ISA-L and OpenSSL results don't match\n"); + return -1; + } + + /* Time ISA-L encryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + perf_stop(&stop); + + printf("aes_xts_256_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Time OpenSSL encryption */ + perf_start(&start); + for (i = 0; i < TEST_LOOPS; i++) + openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct); + perf_stop(&stop); + + printf("aes_xts_256_ossl_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + EVP_CIPHER_CTX_free(ctx); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c new file mode 100644 index 000000000..051dd0a0e --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c @@ -0,0 +1,124 @@ +/********************************************************************** + Copyright(c) 2011-2016 
Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include "aes_xts.h" +#include "aes_keyexp.h" +#include "test.h" + +//#define CACHED_TEST +#ifdef CACHED_TEST +// Cached test, loop many times over small dataset +# define TEST_LEN 8*1024 +# define TEST_LOOPS 3000000 +# define TEST_TYPE_STR "_warm" +#else +// Uncached test. Pull from large mem base. 
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */ +# define TEST_LEN (2 * GT_L3_CACHE) +# define TEST_LOOPS 400 +# define TEST_TYPE_STR "_cold" +#endif + +#define TEST_MEM TEST_LEN + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int i; + + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct; + uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15]; + uint8_t expkey1_dec[16 * 15], null_key[16 * 15]; + + printf("aes_xts_256_enc_perf:\n"); + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct) { + printf("malloc of testsize failed\n"); + return -1; + } + + /* Encode perf test */ + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + + struct perf start, stop; + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + } + + perf_stop(&stop); + + printf("aes_xts_256_enc" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + /* Expanded keys perf test */ + + aes_keyexp_256(key1, expkey1_enc, expkey1_dec); + aes_keyexp_256(key2, expkey2_enc, null_key); + XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, ct); + + perf_start(&start); + + for (i = 0; i < TEST_LOOPS; i++) { + XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, + ct); + } + + perf_stop(&stop); + + printf("aes_xts_256_enc_expanded_key" TEST_TYPE_STR ": "); + perf_print(stop, start, (long long)TEST_LEN * i); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c new file mode 100644 index 
000000000..c8d664a8b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c @@ -0,0 +1,113 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include <aes_keyexp.h> +#include "xts_256_vect.h" + +int main(void) +{ + + // Temporary array for the calculated vectors + uint8_t *ct_test; + uint8_t *pt_test; + // Arrays for expanded keys, null_key is a dummy vector (decrypt key not + // needed for the tweak part of the decryption) + uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15]; + uint8_t expkey1_dec[16 * 15], null_key[16 * 15]; + + int i, j; + + // --- Encryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + ct_test = malloc(vlist[i].ptlen); + if (ct_test == NULL) { + printf("Can't allocate ciphertext memory\n"); + return -1; + } + // Pre-expand our keys (will only use the encryption ones here) + aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec); + aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key); + + XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW, + vlist[i].ptlen, vlist[i].PTX, ct_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (ct_test[j] != vlist[i].CTX[j]) { + printf("\nXTS_AES_256_enc: Vector %d: ", i + 10); + printf("failed at byte %d! 
\n", j); + return -1; + } + } + printf("."); + } + + // --- Decryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated plaintext + pt_test = malloc(vlist[i].ptlen); + if (pt_test == NULL) { + printf("Can't allocate plaintext memory\n"); + return -1; + } + // Pre-expand keys for the decryption + aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec); + aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key); + + // Note, encryption key is re-used for the tweak decryption step + XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW, + vlist[i].ptlen, vlist[i].CTX, pt_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (pt_test[j] != vlist[i].PTX[j]) { + printf("\nXTS_AES_256_dec: Vector %d: ", i + 10); + printf("failed at byte %d! \n", j); + return -1; + } + } + printf("."); + } + printf("Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c new file mode 100644 index 000000000..5ad7359cc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c @@ -0,0 +1,249 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> // for rand +#include <string.h> // for memcmp +#include <aes_xts.h> +#include <aes_keyexp.h> + +#define TEST_LEN (1024*1024) +#define TEST_SIZE (4096) +#ifndef RANDOMS +# define RANDOMS 10 +#endif + +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +int main(void) +{ + int t, n; + + unsigned char key1[16 * 2], key2[16 * 2], tinit[16]; + unsigned char *pt, *ct, *dt; + + int align, size, min_size; + unsigned char *efence_pt; + unsigned char *efence_ct; + unsigned char *efence_dt; + + unsigned char *origin_pt; + unsigned char *origin_ct; + unsigned char *origin_dt; + + unsigned char key1_exp_enc[16 * 15], key1_exp_dec[16 * 15]; + unsigned char key2_exp_tw[16 * 15]; + 
int i; + printf("aes_xts_256 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN); + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt) { + printf("malloc of testsize failed\n"); + return -1; + } + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + + if (memcmp(pt, dt, TEST_LEN)) { + printf("fail\n"); + return -1; + } + putchar('.'); + + // Do tests with random data, keys and message size + for (t = 0; t < RANDOMS; t++) { + n = rand() % (TEST_LEN); + if (n < 17) + continue; + + xts256_mk_rand_data(key1, key2, tinit, pt, n); + XTS_AES_256_enc(key2, key1, tinit, n, pt, ct); + XTS_AES_256_dec(key2, key1, tinit, n, ct, dt); + + if (memcmp(pt, dt, n)) { + printf("fail rand %d, size %d\n", t, n); + return -1; + } + putchar('.'); + fflush(0); + } + + // Run tests at end of buffer for Electric Fence + align = 1; + min_size = 16; + for (size = 0; size <= TEST_SIZE - min_size; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) { + printf("efence: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + fflush(0); + } + + origin_pt = malloc(TEST_LEN); + origin_ct = malloc(TEST_LEN); + origin_dt = malloc(TEST_LEN); + if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) { + printf("malloc of testsize failed\n"); + return -1; + } + // For data lengths from 0 to 15 bytes, the functions return without any error + // codes, without reading or 
writing any data. + for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + memcpy(efence_ct, efence_pt, TEST_SIZE - size); + memcpy(efence_dt, efence_pt, TEST_SIZE - size); + memcpy(origin_pt, efence_pt, TEST_SIZE - size); + memcpy(origin_ct, efence_ct, TEST_SIZE - size); + memcpy(origin_dt, efence_dt, TEST_SIZE - size); + + XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) { + printf("efence_pt: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) { + printf("efence_ct: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) { + printf("efence_dt: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + fflush(0); + } + + for (i = 0; i < 16 * 15; i++) { + key2_exp_tw[i] = rand(); + } + + for (size = 0; size <= TEST_SIZE - min_size; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec); + + XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit, + TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit, + TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) { + printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size); + return -1; + } + putchar('.'); + 
fflush(0); + } + + // For data lengths from 0 to 15 bytes, the functions return without any error + // codes, without reading or writing any data. + for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) { + + // Line up TEST_SIZE from end + efence_pt = pt + TEST_LEN - TEST_SIZE + size; + efence_ct = ct + TEST_LEN - TEST_SIZE + size; + efence_dt = dt + TEST_LEN - TEST_SIZE + size; + + xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size); + memcpy(efence_ct, efence_pt, TEST_SIZE - size); + memcpy(efence_dt, efence_pt, TEST_SIZE - size); + memcpy(origin_pt, efence_pt, TEST_SIZE - size); + memcpy(origin_ct, efence_ct, TEST_SIZE - size); + memcpy(origin_dt, efence_dt, TEST_SIZE - size); + + aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec); + + XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit, + TEST_SIZE - size, efence_pt, efence_ct); + XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit, + TEST_SIZE - size, efence_ct, efence_dt); + + if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) { + printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) { + printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size); + return -1; + } + if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) { + printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size); + return -1; + } + + putchar('.'); + fflush(0); + } + + printf("Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c new file mode 100644 index 000000000..6b25277dc --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c @@ -0,0 +1,273 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "aes_xts.h" +#include <stdlib.h> +#include <openssl/evp.h> + +#ifndef TEST_SEED +# define TEST_SEED 0x1234 +#endif +#ifndef RANDOMS +# define RANDOMS 128 +#endif +#define TEST_LOOPS 128 +#define TEST_LEN (1024*1024) +#define LENGTH_SCAN (2*1024) + +/* Generates random data for keys, tweak and plaintext */ +void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t, + unsigned char *p, int n) +{ + int i; + for (i = 0; i < 32; i++) { + *k1++ = rand(); + *k2++ = rand(); + } + for (i = 0; i < 16; i++) + *t++ = rand(); + + for (i = 0; i < n; i++) + *p++ = rand(); + +} + +/* Wrapper for OpenSSL EVP AES-XTS 256 encryption */ +static inline + int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *pt, unsigned char *ct) +{ + int outlen, tmplen; + if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv) + || (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len)) + || (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))) { + printf("\n Error in openssl encoding of %d bytes\n", len); + return 1; + } + return 0; +} + +/* Wrapper for OpenSSL EVP AES-XTS 256 decryption */ +static inline + int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv, + int len, unsigned char *ct, unsigned char *dt) +{ + int outlen, tmplen; + if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv) + || (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len)) + || (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))) { + printf("\n Error in openssl decoding of %d bytes\n", len); + return 1; + } + return 0; +} + +int main(int argc, char **argv) +{ + + unsigned char key1[32], key2[32], tinit[16]; + unsigned char *pt, *ct, *dt, *refct, *refdt; + unsigned char keyssl[64]; /* SSL takes both keys together */ + int i, j, k, ret; + int seed; + + if (argc == 1) + seed = TEST_SEED; + 
else + seed = atoi(argv[1]); + + srand(seed); + printf("SEED: %d\n", seed); + + /* Initialise our cipher context, which can use same input vectors */ + EVP_CIPHER_CTX *ctx; + ctx = EVP_CIPHER_CTX_new(); + + /* Allocate space for input and output buffers */ + pt = malloc(TEST_LEN); + ct = malloc(TEST_LEN); + dt = malloc(TEST_LEN); + refct = malloc(TEST_LEN); + refdt = malloc(TEST_LEN); + + if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) { + printf("malloc of testsize failed\n"); + return -1; + } + + /**************************** LENGTH SCAN TEST *************************/ + printf("aes_xts_256_rand_ossl test, %d sets of various length: ", 2 * 1024); + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + /* Set up key for the SSL engine */ + for (k = 0; k < 32; k++) { + keyssl[k] = key1[k]; + keyssl[k + 32] = key2[k]; + } + + for (ret = 0, i = 16; ret == 0 && i < LENGTH_SCAN; i++) { + + /* Encrypt using each method */ + XTS_AES_256_enc(key2, key1, tinit, i, pt, ct); + ret |= openssl_aes_256_xts_enc(ctx, keyssl, tinit, i, pt, refct); + + // Compare + for (ret = 0, j = 0; j < i && ret == 0; j++) { + if (ct[j] != refct[j]) + ret = 1; + } + if (ret) + printf(" XTS_AES_256_enc size=%d failed at byte %d!\n", i, j); + + /* Decrypt using each method */ + XTS_AES_256_dec(key2, key1, tinit, i, ct, dt); + ret |= openssl_aes_256_xts_dec(ctx, keyssl, tinit, i, refct, refdt); + + for (k = 0, j = 0; j < TEST_LEN && ret == 0; j++) { + if (dt[j] != refdt[j]) + ret = 1; + } + if (ret) + printf(" XTS_AES_256_dec size=%d failed at byte %d!\n", i, j); + if (0 == i % (LENGTH_SCAN / 16)) + printf("."); + fflush(0); + } + if (ret) + return -1; + printf("Pass\n"); + + /**************************** FIXED LENGTH TEST *************************/ + printf("aes_xts_256_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN); + + /* Loop over the vectors */ + for (i = 0; i < TEST_LOOPS; i++) { + + xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN); + + 
/* Set up key for the SSL engine */ + for (k = 0; k < 32; k++) { + keyssl[k] = key1[k]; + keyssl[k + 32] = key2[k]; + } + + /* Encrypt using each method */ + XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct); + if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct)) + return -1; + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < TEST_LEN; j++) { + + if (ct[j] != refct[j]) { + printf("XTS_AES_256_enc failed at byte %d! \n", j); + return -1; + } + } + + /* Decrypt using each method */ + XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt); + if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt)) + return -1; + + for (j = 0; j < TEST_LEN; j++) { + + if (dt[j] != refdt[j]) { + printf("XTS_AES_256_dec failed at byte %d! \n", j); + return -1; + } + } + if (0 == i % (TEST_LOOPS / 16)) + printf("."); + fflush(0); + } + printf("Pass\n"); + + /**************************** RANDOM LENGTH TEST *************************/ + printf("aes_xts_256_rand_ossl test, %d sets of random lengths: ", RANDOMS); + + /* Run tests with random size */ + + unsigned int rand_len, t; + + for (t = 0; t < RANDOMS; t++) { + + rand_len = rand() % (TEST_LEN); + rand_len = rand_len < 16 ? 16 : rand_len; + xts256_mk_rand_data(key1, key2, tinit, pt, rand_len); + + /* Set up key for the SSL engine */ + for (k = 0; k < 32; k++) { + keyssl[k] = key1[k]; + keyssl[k + 32] = key2[k]; + } + + /* Encrypt using each method */ + XTS_AES_256_enc(key2, key1, tinit, rand_len, pt, ct); + if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct)) + return -1; + + /* Carry out comparison of the calculated ciphertext with + * the reference + */ + for (j = 0; j < rand_len; j++) { + + if (ct[j] != refct[j]) { + printf("XTS_AES_256_enc failed at byte %d! 
\n", j); + return -1; + } + } + + /* Decrypt using each method */ + XTS_AES_256_dec(key2, key1, tinit, rand_len, ct, dt); + if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt)) + return -1; + + for (j = 0; j < rand_len; j++) { + + if (dt[j] != refdt[j]) { + printf("XTS_AES_256_dec failed at byte %d! \n", j); + return -1; + } + } + if (0 == t % (RANDOMS / 16)) + printf("."); + fflush(0); + } + + EVP_CIPHER_CTX_free(ctx); + + printf("Pass\n"); + + printf("aes_xts_256_rand_ossl: All tests passed\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c new file mode 100644 index 000000000..2c961f44f --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c @@ -0,0 +1,105 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include "xts_256_vect.h" + +int main(void) +{ + + // Temporary array for the calculated vectors + uint8_t *ct_test; + uint8_t *pt_test; + + int i, j; + + // --- Encryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + ct_test = malloc(vlist[i].ptlen); + if (ct_test == NULL) { + fprintf(stderr, "Can't allocate ciphertext memory\n"); + return -1; + } + + XTS_AES_256_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW, + vlist[i].ptlen, vlist[i].PTX, ct_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (ct_test[j] != vlist[i].CTX[j]) { + printf("\nXTS_AES_256_enc: Vector %d: ", i + 10); + printf("failed at byte %d! 
\n", j); + return -1; + } + } + printf("."); + + ct_test = NULL; + } + + // --- Decryption test --- + + // Loop over the vectors + for (i = 0; i < NVEC; i++) { + + // Allocate space for the calculated ciphertext + pt_test = malloc(vlist[i].ptlen); + if (pt_test == NULL) { + fprintf(stderr, "Can't allocate plaintext memory\n"); + return -1; + } + + XTS_AES_256_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW, + vlist[i].ptlen, vlist[i].CTX, pt_test); + + // Carry out comparison of the calculated ciphertext with + // the reference + for (j = 0; j < vlist[i].ptlen; j++) { + + if (pt_test[j] != vlist[i].PTX[j]) { + printf("\nXTS_AES_256_dec: Vector %d: ", i + 10); + printf("failed at byte %d! \n", j); + return -1; + } + } + printf("."); + + pt_test = NULL; + } + printf("Pass\n"); + + return 0; +} diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h new file mode 100644 index 000000000..5a893f173 --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h @@ -0,0 +1,1035 @@ +/********************************************************************** + Copyright(c) 2011-2016 Intel Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + +#include "aes_xts.h" + +#define NVEC 5 + +// one XTS-AES-256 test vector: data length plus pointers to the keys, tweak, plaintext and ciphertext +struct xts_vector { + uint64_t ptlen; // length in bytes of PTX (and of CTX) + uint8_t *key1; // dimension 32 for 256 bit aes + uint8_t *key2; // dimension 32 for 256 bit aes + uint8_t *TW; // dimension 16 for both 128 and 256 bit + uint8_t *PTX; // min. dimension 16 + uint8_t *CTX; // same dimension as PTX +}; + +/* Define our test vectors statically here. 
Test vectors are from the standard: + * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented + * Storage Devices" + * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450 + * + * Vector 10 + * Key1 2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 1c3b3a102f770386e4836c99e370cf9bea00803f5e482357a4ae12d414a3e63b + * CTX 5d31e276f8fe4a8d66b317f9ac683f44680a86ac35adfc3345befecb4bb188fd + * CTX 5776926c49a3095eb108fd1098baec70aaa66999a72a82f27d848b21d4a741b0 + * CTX c5cd4d5fff9dac89aeba122961d03a757123e9870f8acf1000020887891429ca + * CTX 2a3e7a7d7df7b10355165c8b9a6d0a7de8b062c4500dc4cd120c0f7418dae3d0 + * CTX 
b5781c34803fa75421c790dfe1de1834f280d7667b327f6c8cd7557e12ac3a0f + * CTX 93ec05c52e0493ef31a12d3d9260f79a289d6a379bc70c50841473d1a8cc81ec + * CTX 583e9645e07b8d9670655ba5bbcfecc6dc3966380ad8fecb17b6ba02469a020a + * CTX 84e18e8f84252070c13e9f1f289be54fbc481457778f616015e1327a02b140f1 + * CTX 505eb309326d68378f8374595c849d84f4c333ec4423885143cb47bd71c5edae + * CTX 9be69a2ffeceb1bec9de244fbe15992b11b77c040f12bd8f6a975a44a0f90c29 + * CTX a9abc3d4d893927284c58754cce294529f8614dcd2aba991925fedc4ae74ffac + * CTX 6e333b93eb4aff0479da9a410e4450e0dd7ae4c6e2910900575da401fc07059f + * CTX 645e8b7e9bfdef33943054ff84011493c27b3429eaedb4ed5376441a77ed4385 + * CTX 1ad77f16f541dfd269d50d6a5f14fb0aab1cbb4c1550be97f7ab4066193c4caa + * CTX 773dad38014bd2092fa755c824bb5e54c4f36ffda9fcea70b9c6e693e148c151 + * Plaintext length (bytes): 512 + */ + +static uint8_t v10_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v10_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v10_TW[16] = { + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v10_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 
0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 
0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v10_CTX[512] = { + 0x1c, 0x3b, 0x3a, 0x10, 0x2f, 0x77, 0x03, 0x86, + 0xe4, 0x83, 0x6c, 0x99, 0xe3, 0x70, 0xcf, 0x9b, + 0xea, 0x00, 0x80, 0x3f, 0x5e, 0x48, 0x23, 0x57, + 0xa4, 0xae, 0x12, 0xd4, 0x14, 0xa3, 0xe6, 0x3b, + 0x5d, 0x31, 0xe2, 0x76, 0xf8, 0xfe, 0x4a, 0x8d, + 0x66, 0xb3, 0x17, 0xf9, 0xac, 0x68, 0x3f, 0x44, + 0x68, 0x0a, 0x86, 0xac, 0x35, 0xad, 0xfc, 0x33, + 0x45, 0xbe, 0xfe, 0xcb, 0x4b, 0xb1, 0x88, 0xfd, + 0x57, 0x76, 0x92, 0x6c, 0x49, 0xa3, 0x09, 0x5e, + 0xb1, 0x08, 0xfd, 0x10, 0x98, 0xba, 0xec, 0x70, + 0xaa, 0xa6, 0x69, 0x99, 0xa7, 0x2a, 0x82, 0xf2, + 0x7d, 0x84, 0x8b, 0x21, 0xd4, 0xa7, 0x41, 0xb0, + 0xc5, 0xcd, 0x4d, 0x5f, 0xff, 0x9d, 0xac, 0x89, + 0xae, 0xba, 0x12, 0x29, 0x61, 0xd0, 0x3a, 0x75, + 0x71, 0x23, 0xe9, 0x87, 0x0f, 0x8a, 0xcf, 0x10, + 0x00, 0x02, 0x08, 0x87, 0x89, 0x14, 0x29, 0xca, + 0x2a, 0x3e, 0x7a, 0x7d, 0x7d, 0xf7, 0xb1, 0x03, + 0x55, 0x16, 0x5c, 0x8b, 0x9a, 0x6d, 0x0a, 0x7d, + 0xe8, 0xb0, 0x62, 0xc4, 0x50, 0x0d, 0xc4, 0xcd, + 0x12, 0x0c, 0x0f, 0x74, 0x18, 0xda, 0xe3, 0xd0, + 0xb5, 0x78, 0x1c, 0x34, 0x80, 0x3f, 0xa7, 0x54, + 0x21, 0xc7, 0x90, 0xdf, 0xe1, 0xde, 0x18, 0x34, + 0xf2, 0x80, 0xd7, 0x66, 0x7b, 0x32, 0x7f, 0x6c, + 0x8c, 0xd7, 0x55, 0x7e, 0x12, 0xac, 0x3a, 0x0f, + 0x93, 0xec, 0x05, 0xc5, 0x2e, 0x04, 0x93, 0xef, + 0x31, 0xa1, 0x2d, 0x3d, 
0x92, 0x60, 0xf7, 0x9a, + 0x28, 0x9d, 0x6a, 0x37, 0x9b, 0xc7, 0x0c, 0x50, + 0x84, 0x14, 0x73, 0xd1, 0xa8, 0xcc, 0x81, 0xec, + 0x58, 0x3e, 0x96, 0x45, 0xe0, 0x7b, 0x8d, 0x96, + 0x70, 0x65, 0x5b, 0xa5, 0xbb, 0xcf, 0xec, 0xc6, + 0xdc, 0x39, 0x66, 0x38, 0x0a, 0xd8, 0xfe, 0xcb, + 0x17, 0xb6, 0xba, 0x02, 0x46, 0x9a, 0x02, 0x0a, + 0x84, 0xe1, 0x8e, 0x8f, 0x84, 0x25, 0x20, 0x70, + 0xc1, 0x3e, 0x9f, 0x1f, 0x28, 0x9b, 0xe5, 0x4f, + 0xbc, 0x48, 0x14, 0x57, 0x77, 0x8f, 0x61, 0x60, + 0x15, 0xe1, 0x32, 0x7a, 0x02, 0xb1, 0x40, 0xf1, + 0x50, 0x5e, 0xb3, 0x09, 0x32, 0x6d, 0x68, 0x37, + 0x8f, 0x83, 0x74, 0x59, 0x5c, 0x84, 0x9d, 0x84, + 0xf4, 0xc3, 0x33, 0xec, 0x44, 0x23, 0x88, 0x51, + 0x43, 0xcb, 0x47, 0xbd, 0x71, 0xc5, 0xed, 0xae, + 0x9b, 0xe6, 0x9a, 0x2f, 0xfe, 0xce, 0xb1, 0xbe, + 0xc9, 0xde, 0x24, 0x4f, 0xbe, 0x15, 0x99, 0x2b, + 0x11, 0xb7, 0x7c, 0x04, 0x0f, 0x12, 0xbd, 0x8f, + 0x6a, 0x97, 0x5a, 0x44, 0xa0, 0xf9, 0x0c, 0x29, + 0xa9, 0xab, 0xc3, 0xd4, 0xd8, 0x93, 0x92, 0x72, + 0x84, 0xc5, 0x87, 0x54, 0xcc, 0xe2, 0x94, 0x52, + 0x9f, 0x86, 0x14, 0xdc, 0xd2, 0xab, 0xa9, 0x91, + 0x92, 0x5f, 0xed, 0xc4, 0xae, 0x74, 0xff, 0xac, + 0x6e, 0x33, 0x3b, 0x93, 0xeb, 0x4a, 0xff, 0x04, + 0x79, 0xda, 0x9a, 0x41, 0x0e, 0x44, 0x50, 0xe0, + 0xdd, 0x7a, 0xe4, 0xc6, 0xe2, 0x91, 0x09, 0x00, + 0x57, 0x5d, 0xa4, 0x01, 0xfc, 0x07, 0x05, 0x9f, + 0x64, 0x5e, 0x8b, 0x7e, 0x9b, 0xfd, 0xef, 0x33, + 0x94, 0x30, 0x54, 0xff, 0x84, 0x01, 0x14, 0x93, + 0xc2, 0x7b, 0x34, 0x29, 0xea, 0xed, 0xb4, 0xed, + 0x53, 0x76, 0x44, 0x1a, 0x77, 0xed, 0x43, 0x85, + 0x1a, 0xd7, 0x7f, 0x16, 0xf5, 0x41, 0xdf, 0xd2, + 0x69, 0xd5, 0x0d, 0x6a, 0x5f, 0x14, 0xfb, 0x0a, + 0xab, 0x1c, 0xbb, 0x4c, 0x15, 0x50, 0xbe, 0x97, + 0xf7, 0xab, 0x40, 0x66, 0x19, 0x3c, 0x4c, 0xaa, + 0x77, 0x3d, 0xad, 0x38, 0x01, 0x4b, 0xd2, 0x09, + 0x2f, 0xa7, 0x55, 0xc8, 0x24, 0xbb, 0x5e, 0x54, + 0xc4, 0xf3, 0x6f, 0xfd, 0xa9, 0xfc, 0xea, 0x70, + 0xb9, 0xc6, 0xe6, 0x93, 0xe1, 0x48, 0xc1, 0x51 +}; + +/* + * Vector 11 + * Key1 
2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ffff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 77a31251618a15e6b92d1d66dffe7b50b50bad552305ba0217a610688eff7e11 + * CTX e1d0225438e093242d6db274fde801d4cae06f2092c728b2478559df58e837c2 + * CTX 469ee4a4fa794e4bbc7f39bc026e3cb72c33b0888f25b4acf56a2a9804f1ce6d + * CTX 3d6e1dc6ca181d4b546179d55544aa7760c40d06741539c7e3cd9d2f6650b201 + * CTX 3fd0eeb8c2b8e3d8d240ccae2d4c98320a7442e1c8d75a42d6e6cfa4c2eca179 + * CTX 8d158c7aecdf82490f24bb9b38e108bcda12c3faf9a21141c3613b58367f922a + * CTX aa26cd22f23d708dae699ad7cb40a8ad0b6e2784973dcb605684c08b8d6998c6 + * CTX 9aac049921871ebb65301a4619ca80ecb485a31d744223ce8ddc2394828d6a80 + * CTX 470c092f5ba413c3378fa6054255c6f9df4495862bbb3287681f931b687c888a 
+ * CTX bf844dfc8fc28331e579928cd12bd2390ae123cf03818d14dedde5c0c24c8ab0 + * CTX 18bfca75ca096f2d531f3d1619e785f1ada437cab92e980558b3dce1474afb75 + * CTX bfedbf8ff54cb2618e0244c9ac0d3c66fb51598cd2db11f9be39791abe447c63 + * CTX 094f7c453b7ff87cb5bb36b7c79efb0872d17058b83b15ab0866ad8a58656c5a + * CTX 7e20dbdf308b2461d97c0ec0024a2715055249cf3b478ddd4740de654f75ca68 + * CTX 6e0d7345c69ed50cdc2a8b332b1f8824108ac937eb050585608ee734097fc090 + * CTX 54fbff89eeaeea791f4a7ab1f9868294a4f9e27b42af8100cb9d59cef9645803 + * Plaintext length (bytes): 512 + * +*/ +static uint8_t v11_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v11_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v11_TW[16] = { + 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v11_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 
0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v11_CTX[512] = { + 0x77, 0xa3, 0x12, 0x51, 0x61, 0x8a, 0x15, 0xe6, + 0xb9, 0x2d, 0x1d, 0x66, 0xdf, 0xfe, 0x7b, 0x50, + 0xb5, 0x0b, 0xad, 0x55, 0x23, 0x05, 0xba, 0x02, + 0x17, 0xa6, 0x10, 0x68, 0x8e, 0xff, 0x7e, 0x11, + 0xe1, 0xd0, 0x22, 0x54, 0x38, 0xe0, 0x93, 0x24, + 0x2d, 0x6d, 0xb2, 0x74, 0xfd, 0xe8, 0x01, 0xd4, + 0xca, 0xe0, 0x6f, 0x20, 0x92, 0xc7, 0x28, 0xb2, + 0x47, 0x85, 0x59, 0xdf, 0x58, 0xe8, 0x37, 0xc2, + 0x46, 0x9e, 0xe4, 0xa4, 0xfa, 0x79, 0x4e, 0x4b, + 0xbc, 0x7f, 0x39, 0xbc, 0x02, 0x6e, 0x3c, 0xb7, + 0x2c, 0x33, 0xb0, 0x88, 0x8f, 0x25, 0xb4, 0xac, + 0xf5, 0x6a, 0x2a, 0x98, 0x04, 0xf1, 0xce, 0x6d, + 0x3d, 0x6e, 0x1d, 0xc6, 0xca, 0x18, 0x1d, 0x4b, + 0x54, 0x61, 0x79, 0xd5, 0x55, 0x44, 0xaa, 0x77, + 0x60, 0xc4, 0x0d, 0x06, 0x74, 0x15, 0x39, 0xc7, + 0xe3, 0xcd, 0x9d, 0x2f, 0x66, 0x50, 0xb2, 0x01, + 0x3f, 0xd0, 0xee, 0xb8, 0xc2, 0xb8, 0xe3, 0xd8, + 0xd2, 0x40, 0xcc, 0xae, 0x2d, 0x4c, 0x98, 0x32, + 0x0a, 0x74, 0x42, 0xe1, 0xc8, 0xd7, 0x5a, 0x42, + 0xd6, 0xe6, 0xcf, 0xa4, 0xc2, 0xec, 0xa1, 0x79, + 0x8d, 0x15, 0x8c, 0x7a, 0xec, 0xdf, 0x82, 0x49, + 0x0f, 0x24, 0xbb, 0x9b, 0x38, 0xe1, 0x08, 0xbc, + 0xda, 0x12, 0xc3, 0xfa, 0xf9, 0xa2, 0x11, 0x41, + 0xc3, 0x61, 0x3b, 0x58, 0x36, 0x7f, 0x92, 0x2a, + 0xaa, 0x26, 0xcd, 0x22, 0xf2, 0x3d, 0x70, 0x8d, + 0xae, 0x69, 0x9a, 0xd7, 0xcb, 0x40, 0xa8, 0xad, + 0x0b, 0x6e, 0x27, 0x84, 0x97, 0x3d, 0xcb, 0x60, + 0x56, 0x84, 0xc0, 0x8b, 0x8d, 0x69, 0x98, 0xc6, + 0x9a, 0xac, 0x04, 0x99, 0x21, 0x87, 0x1e, 0xbb, + 0x65, 0x30, 0x1a, 0x46, 0x19, 0xca, 0x80, 0xec, + 0xb4, 0x85, 0xa3, 0x1d, 0x74, 0x42, 0x23, 0xce, + 0x8d, 0xdc, 
0x23, 0x94, 0x82, 0x8d, 0x6a, 0x80, + 0x47, 0x0c, 0x09, 0x2f, 0x5b, 0xa4, 0x13, 0xc3, + 0x37, 0x8f, 0xa6, 0x05, 0x42, 0x55, 0xc6, 0xf9, + 0xdf, 0x44, 0x95, 0x86, 0x2b, 0xbb, 0x32, 0x87, + 0x68, 0x1f, 0x93, 0x1b, 0x68, 0x7c, 0x88, 0x8a, + 0xbf, 0x84, 0x4d, 0xfc, 0x8f, 0xc2, 0x83, 0x31, + 0xe5, 0x79, 0x92, 0x8c, 0xd1, 0x2b, 0xd2, 0x39, + 0x0a, 0xe1, 0x23, 0xcf, 0x03, 0x81, 0x8d, 0x14, + 0xde, 0xdd, 0xe5, 0xc0, 0xc2, 0x4c, 0x8a, 0xb0, + 0x18, 0xbf, 0xca, 0x75, 0xca, 0x09, 0x6f, 0x2d, + 0x53, 0x1f, 0x3d, 0x16, 0x19, 0xe7, 0x85, 0xf1, + 0xad, 0xa4, 0x37, 0xca, 0xb9, 0x2e, 0x98, 0x05, + 0x58, 0xb3, 0xdc, 0xe1, 0x47, 0x4a, 0xfb, 0x75, + 0xbf, 0xed, 0xbf, 0x8f, 0xf5, 0x4c, 0xb2, 0x61, + 0x8e, 0x02, 0x44, 0xc9, 0xac, 0x0d, 0x3c, 0x66, + 0xfb, 0x51, 0x59, 0x8c, 0xd2, 0xdb, 0x11, 0xf9, + 0xbe, 0x39, 0x79, 0x1a, 0xbe, 0x44, 0x7c, 0x63, + 0x09, 0x4f, 0x7c, 0x45, 0x3b, 0x7f, 0xf8, 0x7c, + 0xb5, 0xbb, 0x36, 0xb7, 0xc7, 0x9e, 0xfb, 0x08, + 0x72, 0xd1, 0x70, 0x58, 0xb8, 0x3b, 0x15, 0xab, + 0x08, 0x66, 0xad, 0x8a, 0x58, 0x65, 0x6c, 0x5a, + 0x7e, 0x20, 0xdb, 0xdf, 0x30, 0x8b, 0x24, 0x61, + 0xd9, 0x7c, 0x0e, 0xc0, 0x02, 0x4a, 0x27, 0x15, + 0x05, 0x52, 0x49, 0xcf, 0x3b, 0x47, 0x8d, 0xdd, + 0x47, 0x40, 0xde, 0x65, 0x4f, 0x75, 0xca, 0x68, + 0x6e, 0x0d, 0x73, 0x45, 0xc6, 0x9e, 0xd5, 0x0c, + 0xdc, 0x2a, 0x8b, 0x33, 0x2b, 0x1f, 0x88, 0x24, + 0x10, 0x8a, 0xc9, 0x37, 0xeb, 0x05, 0x05, 0x85, + 0x60, 0x8e, 0xe7, 0x34, 0x09, 0x7f, 0xc0, 0x90, + 0x54, 0xfb, 0xff, 0x89, 0xee, 0xae, 0xea, 0x79, + 0x1f, 0x4a, 0x7a, 0xb1, 0xf9, 0x86, 0x82, 0x94, + 0xa4, 0xf9, 0xe2, 0x7b, 0x42, 0xaf, 0x81, 0x00, + 0xcb, 0x9d, 0x59, 0xce, 0xf9, 0x64, 0x58, 0x03 +}; + +/* + * Vector 12 + * Key1 2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ffffff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 
404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX e387aaa58ba483afa7e8eb469778317ecf4cf573aa9d4eac23f2cdf914e4e200 + * CTX a8b490e42ee646802dc6ee2b471b278195d60918ececb44bf79966f83faba049 + * CTX 9298ebc699c0c8634715a320bb4f075d622e74c8c932004f25b41e361025b5a8 + * CTX 7815391f6108fc4afa6a05d9303c6ba68a128a55705d415985832fdeaae6c8e1 + * CTX 9110e84d1b1f199a2692119edc96132658f09da7c623efcec712537a3d94c0bf + * CTX 5d7e352ec94ae5797fdb377dc1551150721adf15bd26a8efc2fcaad56881fa9e + * CTX 62462c28f30ae1ceaca93c345cf243b73f542e2074a705bd2643bb9f7cc79bb6 + * CTX e7091ea6e232df0f9ad0d6cf502327876d82207abf2115cdacf6d5a48f6c1879 + * CTX a65b115f0f8b3cb3c59d15dd8c769bc014795a1837f3901b5845eb491adfefe0 + * CTX 97b1fa30a12fc1f65ba22905031539971a10f2f36c321bb51331cdefb39e3964 + * CTX c7ef079994f5b69b2edd83a71ef549971ee93f44eac3938fcdd61d01fa71799d + * CTX a3a8091c4c48aa9ed263ff0749df95d44fef6a0bb578ec69456aa5408ae32c7a + * CTX f08ad7ba8921287e3bbee31b767be06a0e705c864a769137df28292283ea81a2 + * CTX 
480241b44d9921cdbec1bc28dc1fda114bd8e5217ac9d8ebafa720e9da4f9ace + * CTX 231cc949e5b96fe76ffc21063fddc83a6b8679c00d35e09576a875305bed5f36 + * CTX ed242c8900dd1fa965bc950dfce09b132263a1eef52dd6888c309f5a7d712826 + * Plaintext length (bytes): 512 +*/ + +static uint8_t v12_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v12_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v12_TW[16] = { + 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v12_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v12_CTX[512] = { + 0xe3, 0x87, 0xaa, 0xa5, 0x8b, 0xa4, 0x83, 0xaf, + 0xa7, 0xe8, 0xeb, 0x46, 0x97, 0x78, 0x31, 0x7e, + 0xcf, 0x4c, 0xf5, 0x73, 0xaa, 0x9d, 0x4e, 0xac, + 0x23, 0xf2, 0xcd, 0xf9, 0x14, 0xe4, 0xe2, 0x00, + 0xa8, 0xb4, 0x90, 0xe4, 0x2e, 0xe6, 0x46, 0x80, + 0x2d, 0xc6, 0xee, 0x2b, 0x47, 0x1b, 0x27, 0x81, + 0x95, 0xd6, 0x09, 0x18, 0xec, 0xec, 0xb4, 0x4b, + 0xf7, 0x99, 0x66, 0xf8, 0x3f, 0xab, 0xa0, 0x49, + 0x92, 0x98, 0xeb, 0xc6, 0x99, 0xc0, 0xc8, 0x63, + 0x47, 0x15, 0xa3, 0x20, 0xbb, 0x4f, 0x07, 0x5d, + 0x62, 0x2e, 0x74, 0xc8, 0xc9, 0x32, 0x00, 0x4f, + 0x25, 0xb4, 0x1e, 0x36, 0x10, 0x25, 0xb5, 0xa8, + 0x78, 0x15, 0x39, 0x1f, 0x61, 0x08, 0xfc, 0x4a, + 0xfa, 0x6a, 0x05, 0xd9, 0x30, 0x3c, 0x6b, 0xa6, + 0x8a, 0x12, 0x8a, 0x55, 0x70, 0x5d, 0x41, 0x59, + 0x85, 0x83, 0x2f, 0xde, 0xaa, 0xe6, 0xc8, 0xe1, + 0x91, 0x10, 0xe8, 0x4d, 0x1b, 0x1f, 0x19, 0x9a, + 0x26, 0x92, 0x11, 0x9e, 0xdc, 0x96, 0x13, 0x26, + 0x58, 0xf0, 0x9d, 0xa7, 0xc6, 0x23, 0xef, 0xce, + 0xc7, 0x12, 0x53, 0x7a, 0x3d, 0x94, 0xc0, 0xbf, + 0x5d, 0x7e, 0x35, 0x2e, 0xc9, 0x4a, 0xe5, 0x79, + 0x7f, 0xdb, 0x37, 0x7d, 0xc1, 0x55, 0x11, 0x50, + 0x72, 0x1a, 0xdf, 0x15, 0xbd, 0x26, 0xa8, 0xef, + 0xc2, 0xfc, 0xaa, 0xd5, 0x68, 0x81, 0xfa, 0x9e, + 0x62, 0x46, 0x2c, 0x28, 0xf3, 0x0a, 0xe1, 0xce, + 0xac, 0xa9, 0x3c, 0x34, 0x5c, 0xf2, 0x43, 0xb7, + 0x3f, 0x54, 0x2e, 0x20, 0x74, 0xa7, 0x05, 0xbd, + 0x26, 0x43, 0xbb, 0x9f, 0x7c, 0xc7, 0x9b, 0xb6, + 0xe7, 0x09, 0x1e, 0xa6, 0xe2, 0x32, 0xdf, 0x0f, + 0x9a, 0xd0, 0xd6, 0xcf, 0x50, 0x23, 0x27, 0x87, + 0x6d, 0x82, 0x20, 0x7a, 0xbf, 0x21, 0x15, 0xcd, + 0xac, 0xf6, 0xd5, 0xa4, 0x8f, 0x6c, 0x18, 0x79, + 0xa6, 0x5b, 0x11, 0x5f, 0x0f, 0x8b, 0x3c, 0xb3, + 0xc5, 0x9d, 0x15, 0xdd, 0x8c, 0x76, 0x9b, 0xc0, + 0x14, 0x79, 0x5a, 0x18, 0x37, 0xf3, 0x90, 0x1b, + 0x58, 0x45, 0xeb, 0x49, 0x1a, 0xdf, 0xef, 0xe0, + 0x97, 0xb1, 0xfa, 0x30, 0xa1, 0x2f, 0xc1, 0xf6, + 0x5b, 0xa2, 
0x29, 0x05, 0x03, 0x15, 0x39, 0x97, + 0x1a, 0x10, 0xf2, 0xf3, 0x6c, 0x32, 0x1b, 0xb5, + 0x13, 0x31, 0xcd, 0xef, 0xb3, 0x9e, 0x39, 0x64, + 0xc7, 0xef, 0x07, 0x99, 0x94, 0xf5, 0xb6, 0x9b, + 0x2e, 0xdd, 0x83, 0xa7, 0x1e, 0xf5, 0x49, 0x97, + 0x1e, 0xe9, 0x3f, 0x44, 0xea, 0xc3, 0x93, 0x8f, + 0xcd, 0xd6, 0x1d, 0x01, 0xfa, 0x71, 0x79, 0x9d, + 0xa3, 0xa8, 0x09, 0x1c, 0x4c, 0x48, 0xaa, 0x9e, + 0xd2, 0x63, 0xff, 0x07, 0x49, 0xdf, 0x95, 0xd4, + 0x4f, 0xef, 0x6a, 0x0b, 0xb5, 0x78, 0xec, 0x69, + 0x45, 0x6a, 0xa5, 0x40, 0x8a, 0xe3, 0x2c, 0x7a, + 0xf0, 0x8a, 0xd7, 0xba, 0x89, 0x21, 0x28, 0x7e, + 0x3b, 0xbe, 0xe3, 0x1b, 0x76, 0x7b, 0xe0, 0x6a, + 0x0e, 0x70, 0x5c, 0x86, 0x4a, 0x76, 0x91, 0x37, + 0xdf, 0x28, 0x29, 0x22, 0x83, 0xea, 0x81, 0xa2, + 0x48, 0x02, 0x41, 0xb4, 0x4d, 0x99, 0x21, 0xcd, + 0xbe, 0xc1, 0xbc, 0x28, 0xdc, 0x1f, 0xda, 0x11, + 0x4b, 0xd8, 0xe5, 0x21, 0x7a, 0xc9, 0xd8, 0xeb, + 0xaf, 0xa7, 0x20, 0xe9, 0xda, 0x4f, 0x9a, 0xce, + 0x23, 0x1c, 0xc9, 0x49, 0xe5, 0xb9, 0x6f, 0xe7, + 0x6f, 0xfc, 0x21, 0x06, 0x3f, 0xdd, 0xc8, 0x3a, + 0x6b, 0x86, 0x79, 0xc0, 0x0d, 0x35, 0xe0, 0x95, + 0x76, 0xa8, 0x75, 0x30, 0x5b, 0xed, 0x5f, 0x36, + 0xed, 0x24, 0x2c, 0x89, 0x00, 0xdd, 0x1f, 0xa9, + 0x65, 0xbc, 0x95, 0x0d, 0xfc, 0xe0, 0x9b, 0x13, + 0x22, 0x63, 0xa1, 0xee, 0xf5, 0x2d, 0xd6, 0x88, + 0x8c, 0x30, 0x9f, 0x5a, 0x7d, 0x71, 0x28, 0x26 +}; + +/* + * Vector 13 + * Key1 2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ffffffff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX 
c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX bf53d2dade78e822a4d949a9bc6766b01b06a8ef70d26748c6a7fc36d80ae4c5 + * CTX 520f7c4ab0ac8544424fa405162fef5a6b7f229498063618d39f0003cb5fb8d1 + * CTX c86b643497da1ff945c8d3bedeca4f479702a7a735f043ddb1d6aaade3c4a0ac + * CTX 7ca7f3fa5279bef56f82cd7a2f38672e824814e10700300a055e1630b8f1cb0e + * CTX 919f5e942010a416e2bf48cb46993d3cb6a51c19bacf864785a00bc2ecff15d3 + * CTX 50875b246ed53e68be6f55bd7e05cfc2b2ed6432198a6444b6d8c247fab941f5 + * CTX 69768b5c429366f1d3f00f0345b96123d56204c01c63b22ce78baf116e525ed9 + * CTX 0fdea39fa469494d3866c31e05f295ff21fea8d4e6e13d67e47ce722e9698a1c + * CTX 1048d68ebcde76b86fcf976eab8aa9790268b7068e017a8b9b749409514f1053 + * CTX 027fd16c3786ea1bac5f15cb79711ee2abe82f5cf8b13ae73030ef5b9e4457e7 + * CTX 5d1304f988d62dd6fc4b94ed38ba831da4b7634971b6cd8ec325d9c61c00f1df + * CTX 73627ed3745a5e8489f3a95c69639c32cd6e1d537a85f75cc844726e8a72fc00 + * CTX 77ad22000f1d5078f6b866318c668f1ad03d5a5fced5219f2eabbd0aa5c0f460 + * CTX d183f04404a0d6f469558e81fab24a167905ab4c7878502ad3e38fdbe62a4155 + * CTX 6cec37325759533ce8f25f367c87bb5578d667ae93f9e2fd99bcbc5f2fbba88c + * CTX f6516139420fcff3b7361d86322c4bd84c82f335abb152c4a93411373aaa8220 + * Plaintext length (bytes): 512 +*/ + +static uint8_t v13_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 
0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v13_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v13_TW[16] = { + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v13_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 
0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v13_CTX[512] = { + 0xbf, 0x53, 0xd2, 0xda, 0xde, 0x78, 0xe8, 0x22, + 0xa4, 0xd9, 0x49, 0xa9, 0xbc, 0x67, 0x66, 0xb0, + 0x1b, 0x06, 0xa8, 0xef, 0x70, 0xd2, 0x67, 0x48, + 0xc6, 0xa7, 0xfc, 0x36, 0xd8, 
0x0a, 0xe4, 0xc5, + 0x52, 0x0f, 0x7c, 0x4a, 0xb0, 0xac, 0x85, 0x44, + 0x42, 0x4f, 0xa4, 0x05, 0x16, 0x2f, 0xef, 0x5a, + 0x6b, 0x7f, 0x22, 0x94, 0x98, 0x06, 0x36, 0x18, + 0xd3, 0x9f, 0x00, 0x03, 0xcb, 0x5f, 0xb8, 0xd1, + 0xc8, 0x6b, 0x64, 0x34, 0x97, 0xda, 0x1f, 0xf9, + 0x45, 0xc8, 0xd3, 0xbe, 0xde, 0xca, 0x4f, 0x47, + 0x97, 0x02, 0xa7, 0xa7, 0x35, 0xf0, 0x43, 0xdd, + 0xb1, 0xd6, 0xaa, 0xad, 0xe3, 0xc4, 0xa0, 0xac, + 0x7c, 0xa7, 0xf3, 0xfa, 0x52, 0x79, 0xbe, 0xf5, + 0x6f, 0x82, 0xcd, 0x7a, 0x2f, 0x38, 0x67, 0x2e, + 0x82, 0x48, 0x14, 0xe1, 0x07, 0x00, 0x30, 0x0a, + 0x05, 0x5e, 0x16, 0x30, 0xb8, 0xf1, 0xcb, 0x0e, + 0x91, 0x9f, 0x5e, 0x94, 0x20, 0x10, 0xa4, 0x16, + 0xe2, 0xbf, 0x48, 0xcb, 0x46, 0x99, 0x3d, 0x3c, + 0xb6, 0xa5, 0x1c, 0x19, 0xba, 0xcf, 0x86, 0x47, + 0x85, 0xa0, 0x0b, 0xc2, 0xec, 0xff, 0x15, 0xd3, + 0x50, 0x87, 0x5b, 0x24, 0x6e, 0xd5, 0x3e, 0x68, + 0xbe, 0x6f, 0x55, 0xbd, 0x7e, 0x05, 0xcf, 0xc2, + 0xb2, 0xed, 0x64, 0x32, 0x19, 0x8a, 0x64, 0x44, + 0xb6, 0xd8, 0xc2, 0x47, 0xfa, 0xb9, 0x41, 0xf5, + 0x69, 0x76, 0x8b, 0x5c, 0x42, 0x93, 0x66, 0xf1, + 0xd3, 0xf0, 0x0f, 0x03, 0x45, 0xb9, 0x61, 0x23, + 0xd5, 0x62, 0x04, 0xc0, 0x1c, 0x63, 0xb2, 0x2c, + 0xe7, 0x8b, 0xaf, 0x11, 0x6e, 0x52, 0x5e, 0xd9, + 0x0f, 0xde, 0xa3, 0x9f, 0xa4, 0x69, 0x49, 0x4d, + 0x38, 0x66, 0xc3, 0x1e, 0x05, 0xf2, 0x95, 0xff, + 0x21, 0xfe, 0xa8, 0xd4, 0xe6, 0xe1, 0x3d, 0x67, + 0xe4, 0x7c, 0xe7, 0x22, 0xe9, 0x69, 0x8a, 0x1c, + 0x10, 0x48, 0xd6, 0x8e, 0xbc, 0xde, 0x76, 0xb8, + 0x6f, 0xcf, 0x97, 0x6e, 0xab, 0x8a, 0xa9, 0x79, + 0x02, 0x68, 0xb7, 0x06, 0x8e, 0x01, 0x7a, 0x8b, + 0x9b, 0x74, 0x94, 0x09, 0x51, 0x4f, 0x10, 0x53, + 0x02, 0x7f, 0xd1, 0x6c, 0x37, 0x86, 0xea, 0x1b, + 0xac, 0x5f, 0x15, 0xcb, 0x79, 0x71, 0x1e, 0xe2, + 0xab, 0xe8, 0x2f, 0x5c, 0xf8, 0xb1, 0x3a, 0xe7, + 0x30, 0x30, 0xef, 0x5b, 0x9e, 0x44, 0x57, 0xe7, + 0x5d, 0x13, 0x04, 0xf9, 0x88, 0xd6, 0x2d, 0xd6, + 0xfc, 0x4b, 0x94, 0xed, 0x38, 0xba, 0x83, 0x1d, + 0xa4, 0xb7, 0x63, 0x49, 0x71, 0xb6, 0xcd, 0x8e, + 0xc3, 0x25, 0xd9, 0xc6, 0x1c, 
0x00, 0xf1, 0xdf, + 0x73, 0x62, 0x7e, 0xd3, 0x74, 0x5a, 0x5e, 0x84, + 0x89, 0xf3, 0xa9, 0x5c, 0x69, 0x63, 0x9c, 0x32, + 0xcd, 0x6e, 0x1d, 0x53, 0x7a, 0x85, 0xf7, 0x5c, + 0xc8, 0x44, 0x72, 0x6e, 0x8a, 0x72, 0xfc, 0x00, + 0x77, 0xad, 0x22, 0x00, 0x0f, 0x1d, 0x50, 0x78, + 0xf6, 0xb8, 0x66, 0x31, 0x8c, 0x66, 0x8f, 0x1a, + 0xd0, 0x3d, 0x5a, 0x5f, 0xce, 0xd5, 0x21, 0x9f, + 0x2e, 0xab, 0xbd, 0x0a, 0xa5, 0xc0, 0xf4, 0x60, + 0xd1, 0x83, 0xf0, 0x44, 0x04, 0xa0, 0xd6, 0xf4, + 0x69, 0x55, 0x8e, 0x81, 0xfa, 0xb2, 0x4a, 0x16, + 0x79, 0x05, 0xab, 0x4c, 0x78, 0x78, 0x50, 0x2a, + 0xd3, 0xe3, 0x8f, 0xdb, 0xe6, 0x2a, 0x41, 0x55, + 0x6c, 0xec, 0x37, 0x32, 0x57, 0x59, 0x53, 0x3c, + 0xe8, 0xf2, 0x5f, 0x36, 0x7c, 0x87, 0xbb, 0x55, + 0x78, 0xd6, 0x67, 0xae, 0x93, 0xf9, 0xe2, 0xfd, + 0x99, 0xbc, 0xbc, 0x5f, 0x2f, 0xbb, 0xa8, 0x8c, + 0xf6, 0x51, 0x61, 0x39, 0x42, 0x0f, 0xcf, 0xf3, + 0xb7, 0x36, 0x1d, 0x86, 0x32, 0x2c, 0x4b, 0xd8, + 0x4c, 0x82, 0xf3, 0x35, 0xab, 0xb1, 0x52, 0xc4, + 0xa9, 0x34, 0x11, 0x37, 0x3a, 0xaa, 0x82, 0x20 +}; + +/* + * Vector 14 + * Key1 2718281828459045235360287471352662497757247093699959574966967627 + * Key2 3141592653589793238462643383279502884197169399375105820974944592 + * Data Unit Sequence Number ffffffffff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f + * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f + * PTX 
404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f + * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f + * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f + * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf + * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf + * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + * CTX 64497e5a831e4a932c09be3e5393376daa599548b816031d224bbf50a818ed23 + * CTX 50eae7e96087c8a0db51ad290bd00c1ac1620857635bf246c176ab463be30b80 + * CTX 8da548081ac847b158e1264be25bb0910bbc92647108089415d45fab1b3d2604 + * CTX e8a8eff1ae4020cfa39936b66827b23f371b92200be90251e6d73c5f86de5fd4 + * CTX a950781933d79a28272b782a2ec313efdfcc0628f43d744c2dc2ff3dcb66999b + * CTX 50c7ca895b0c64791eeaa5f29499fb1c026f84ce5b5c72ba1083cddb5ce45434 + * CTX 631665c333b60b11593fb253c5179a2c8db813782a004856a1653011e93fb6d8 + * CTX 76c18366dd8683f53412c0c180f9c848592d593f8609ca736317d356e13e2bff + * CTX 3a9f59cd9aeb19cd482593d8c46128bb32423b37a9adfb482b99453fbe25a41b + * CTX f6feb4aa0bef5ed24bf73c762978025482c13115e4015aac992e5613a3b5c2f6 + * CTX 85b84795cb6e9b2656d8c88157e52c42f978d8634c43d06fea928f2822e465aa + * CTX 6576e9bf419384506cc3ce3c54ac1a6f67dc66f3b30191e698380bc999b05abc + * CTX e19dc0c6dcc2dd001ec535ba18deb2df1a101023108318c75dc98611a09dc48a + * CTX 0acdec676fabdf222f07e026f059b672b56e5cbc8e1d21bbd867dd9272120546 + * CTX 81d70ea737134cdfce93b6f82ae22423274e58a0821cc5502e2d0ab4585e94de + * CTX 6975be5e0b4efce51cd3e70c25a1fbbbd609d273ad5b0d59631c531f6a0a57b9 + * Plaintext length (bytes): 512 +*/ + +static uint8_t v14_key1[32] = { + 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45, + 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26, + 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69, + 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27 +}; + +static uint8_t v14_key2[32] = { + 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93, + 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 
0x27, 0x95, + 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37, + 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92 +}; + +static uint8_t v14_TW[16] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static uint8_t v14_PTX[512] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 
0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff +}; + +static uint8_t v14_CTX[512] = { + 0x64, 0x49, 0x7e, 0x5a, 0x83, 0x1e, 0x4a, 0x93, + 0x2c, 0x09, 0xbe, 0x3e, 0x53, 0x93, 0x37, 0x6d, + 0xaa, 0x59, 0x95, 0x48, 0xb8, 0x16, 0x03, 0x1d, + 0x22, 0x4b, 0xbf, 0x50, 0xa8, 0x18, 0xed, 0x23, + 0x50, 0xea, 0xe7, 0xe9, 0x60, 0x87, 0xc8, 0xa0, + 0xdb, 0x51, 0xad, 0x29, 0x0b, 0xd0, 0x0c, 0x1a, + 0xc1, 0x62, 0x08, 0x57, 0x63, 0x5b, 0xf2, 0x46, + 0xc1, 0x76, 0xab, 0x46, 0x3b, 0xe3, 0x0b, 0x80, + 0x8d, 0xa5, 0x48, 0x08, 0x1a, 0xc8, 0x47, 0xb1, + 0x58, 0xe1, 0x26, 0x4b, 
0xe2, 0x5b, 0xb0, 0x91, + 0x0b, 0xbc, 0x92, 0x64, 0x71, 0x08, 0x08, 0x94, + 0x15, 0xd4, 0x5f, 0xab, 0x1b, 0x3d, 0x26, 0x04, + 0xe8, 0xa8, 0xef, 0xf1, 0xae, 0x40, 0x20, 0xcf, + 0xa3, 0x99, 0x36, 0xb6, 0x68, 0x27, 0xb2, 0x3f, + 0x37, 0x1b, 0x92, 0x20, 0x0b, 0xe9, 0x02, 0x51, + 0xe6, 0xd7, 0x3c, 0x5f, 0x86, 0xde, 0x5f, 0xd4, + 0xa9, 0x50, 0x78, 0x19, 0x33, 0xd7, 0x9a, 0x28, + 0x27, 0x2b, 0x78, 0x2a, 0x2e, 0xc3, 0x13, 0xef, + 0xdf, 0xcc, 0x06, 0x28, 0xf4, 0x3d, 0x74, 0x4c, + 0x2d, 0xc2, 0xff, 0x3d, 0xcb, 0x66, 0x99, 0x9b, + 0x50, 0xc7, 0xca, 0x89, 0x5b, 0x0c, 0x64, 0x79, + 0x1e, 0xea, 0xa5, 0xf2, 0x94, 0x99, 0xfb, 0x1c, + 0x02, 0x6f, 0x84, 0xce, 0x5b, 0x5c, 0x72, 0xba, + 0x10, 0x83, 0xcd, 0xdb, 0x5c, 0xe4, 0x54, 0x34, + 0x63, 0x16, 0x65, 0xc3, 0x33, 0xb6, 0x0b, 0x11, + 0x59, 0x3f, 0xb2, 0x53, 0xc5, 0x17, 0x9a, 0x2c, + 0x8d, 0xb8, 0x13, 0x78, 0x2a, 0x00, 0x48, 0x56, + 0xa1, 0x65, 0x30, 0x11, 0xe9, 0x3f, 0xb6, 0xd8, + 0x76, 0xc1, 0x83, 0x66, 0xdd, 0x86, 0x83, 0xf5, + 0x34, 0x12, 0xc0, 0xc1, 0x80, 0xf9, 0xc8, 0x48, + 0x59, 0x2d, 0x59, 0x3f, 0x86, 0x09, 0xca, 0x73, + 0x63, 0x17, 0xd3, 0x56, 0xe1, 0x3e, 0x2b, 0xff, + 0x3a, 0x9f, 0x59, 0xcd, 0x9a, 0xeb, 0x19, 0xcd, + 0x48, 0x25, 0x93, 0xd8, 0xc4, 0x61, 0x28, 0xbb, + 0x32, 0x42, 0x3b, 0x37, 0xa9, 0xad, 0xfb, 0x48, + 0x2b, 0x99, 0x45, 0x3f, 0xbe, 0x25, 0xa4, 0x1b, + 0xf6, 0xfe, 0xb4, 0xaa, 0x0b, 0xef, 0x5e, 0xd2, + 0x4b, 0xf7, 0x3c, 0x76, 0x29, 0x78, 0x02, 0x54, + 0x82, 0xc1, 0x31, 0x15, 0xe4, 0x01, 0x5a, 0xac, + 0x99, 0x2e, 0x56, 0x13, 0xa3, 0xb5, 0xc2, 0xf6, + 0x85, 0xb8, 0x47, 0x95, 0xcb, 0x6e, 0x9b, 0x26, + 0x56, 0xd8, 0xc8, 0x81, 0x57, 0xe5, 0x2c, 0x42, + 0xf9, 0x78, 0xd8, 0x63, 0x4c, 0x43, 0xd0, 0x6f, + 0xea, 0x92, 0x8f, 0x28, 0x22, 0xe4, 0x65, 0xaa, + 0x65, 0x76, 0xe9, 0xbf, 0x41, 0x93, 0x84, 0x50, + 0x6c, 0xc3, 0xce, 0x3c, 0x54, 0xac, 0x1a, 0x6f, + 0x67, 0xdc, 0x66, 0xf3, 0xb3, 0x01, 0x91, 0xe6, + 0x98, 0x38, 0x0b, 0xc9, 0x99, 0xb0, 0x5a, 0xbc, + 0xe1, 0x9d, 0xc0, 0xc6, 0xdc, 0xc2, 0xdd, 0x00, + 0x1e, 0xc5, 0x35, 0xba, 
0x18, 0xde, 0xb2, 0xdf, + 0x1a, 0x10, 0x10, 0x23, 0x10, 0x83, 0x18, 0xc7, + 0x5d, 0xc9, 0x86, 0x11, 0xa0, 0x9d, 0xc4, 0x8a, + 0x0a, 0xcd, 0xec, 0x67, 0x6f, 0xab, 0xdf, 0x22, + 0x2f, 0x07, 0xe0, 0x26, 0xf0, 0x59, 0xb6, 0x72, + 0xb5, 0x6e, 0x5c, 0xbc, 0x8e, 0x1d, 0x21, 0xbb, + 0xd8, 0x67, 0xdd, 0x92, 0x72, 0x12, 0x05, 0x46, + 0x81, 0xd7, 0x0e, 0xa7, 0x37, 0x13, 0x4c, 0xdf, + 0xce, 0x93, 0xb6, 0xf8, 0x2a, 0xe2, 0x24, 0x23, + 0x27, 0x4e, 0x58, 0xa0, 0x82, 0x1c, 0xc5, 0x50, + 0x2e, 0x2d, 0x0a, 0xb4, 0x58, 0x5e, 0x94, 0xde, + 0x69, 0x75, 0xbe, 0x5e, 0x0b, 0x4e, 0xfc, 0xe5, + 0x1c, 0xd3, 0xe7, 0x0c, 0x25, 0xa1, 0xfb, 0xbb, + 0xd6, 0x09, 0xd2, 0x73, 0xad, 0x5b, 0x0d, 0x59, + 0x63, 0x1c, 0x53, 0x1f, 0x6a, 0x0a, 0x57, 0xb9 +}; + +// +// Define vector of structs, with pointers to the statically defined vectors + +struct xts_vector vlist[NVEC] = { + + // pointers to the statically defined vectors here + + // Vector 10 + {sizeof(v10_CTX), v10_key1, v10_key2, v10_TW, v10_PTX, v10_CTX} + , + // Vector 11 + {sizeof(v11_CTX), v11_key1, v11_key2, v11_TW, v11_PTX, v11_CTX} + , + // Vector 12 + {sizeof(v12_CTX), v12_key1, v12_key2, v12_TW, v12_PTX, v12_CTX} + , + // Vector 13 + {sizeof(v13_CTX), v13_key1, v13_key2, v13_TW, v13_PTX, v13_CTX} + , + // Vector 14 + {sizeof(v14_CTX), v14_key1, v14_key2, v14_TW, v14_PTX, v14_CTX} + +}; diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm new file mode 100644 index 000000000..416da1e7b --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm @@ -0,0 +1,78 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern XTS_AES_128_enc_sse +extern XTS_AES_128_enc_avx + +extern XTS_AES_128_enc_expanded_key_sse +extern XTS_AES_128_enc_expanded_key_avx + +extern XTS_AES_128_dec_sse +extern XTS_AES_128_dec_avx + +extern XTS_AES_128_dec_expanded_key_sse +extern XTS_AES_128_dec_expanded_key_avx + +%if (AS_FEATURE_LEVEL) >= 10 +extern XTS_AES_128_enc_vaes +extern XTS_AES_128_enc_expanded_key_vaes +extern XTS_AES_128_dec_vaes +extern XTS_AES_128_dec_expanded_key_vaes +%endif + +section .text + +%include "multibinary.asm" + +;;;; +; instantiate XTS_AES_128_enc, XTS_AES_128_enc_expanded_key, XTS_AES_128_dec, and XTS_AES_128_dec_expanded_key +;;;; +mbin_interface XTS_AES_128_enc +mbin_dispatch_init7 XTS_AES_128_enc, XTS_AES_128_enc_sse, XTS_AES_128_enc_sse, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx, XTS_AES_128_enc_vaes + +mbin_interface XTS_AES_128_enc_expanded_key +mbin_dispatch_init7 XTS_AES_128_enc_expanded_key, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_vaes + +mbin_interface XTS_AES_128_dec +mbin_dispatch_init7 XTS_AES_128_dec, XTS_AES_128_dec_sse, XTS_AES_128_dec_sse, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx, XTS_AES_128_dec_vaes + +mbin_interface XTS_AES_128_dec_expanded_key +mbin_dispatch_init7 XTS_AES_128_dec_expanded_key, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_vaes + + +;;; func core, ver, snum +slversion XTS_AES_128_enc, 01, 04, 0071 +slversion XTS_AES_128_enc_expanded_key, 01, 04, 0072 +slversion XTS_AES_128_dec, 01, 04, 0073 +slversion XTS_AES_128_dec_expanded_key, 01, 04, 0074 diff --git 
a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm new file mode 100644 index 000000000..33f376d5c --- /dev/null +++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm @@ -0,0 +1,78 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2016 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +default rel +[bits 64] + +%include "reg_sizes.asm" + +extern XTS_AES_256_enc_sse +extern XTS_AES_256_enc_avx + +extern XTS_AES_256_enc_expanded_key_sse +extern XTS_AES_256_enc_expanded_key_avx + +extern XTS_AES_256_dec_sse +extern XTS_AES_256_dec_avx + +extern XTS_AES_256_dec_expanded_key_sse +extern XTS_AES_256_dec_expanded_key_avx + +%if (AS_FEATURE_LEVEL) >= 10 +extern XTS_AES_256_enc_vaes +extern XTS_AES_256_enc_expanded_key_vaes +extern XTS_AES_256_dec_vaes +extern XTS_AES_256_dec_expanded_key_vaes +%endif + +section .text + +%include "multibinary.asm" + +;;;; +; instantiate XTS_AES_256_enc, XTS_AES_256_enc_expanded_key, XTS_AES_256_dec, and XTS_AES_256_dec_expanded_key +;;;; +mbin_interface XTS_AES_256_enc +mbin_dispatch_init7 XTS_AES_256_enc, XTS_AES_256_enc_sse, XTS_AES_256_enc_sse, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx, XTS_AES_256_enc_vaes + +mbin_interface XTS_AES_256_enc_expanded_key +mbin_dispatch_init7 XTS_AES_256_enc_expanded_key, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_vaes + +mbin_interface XTS_AES_256_dec +mbin_dispatch_init7 XTS_AES_256_dec, XTS_AES_256_dec_sse, XTS_AES_256_dec_sse, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx, XTS_AES_256_dec_vaes + +mbin_interface XTS_AES_256_dec_expanded_key +mbin_dispatch_init7 XTS_AES_256_dec_expanded_key, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_vaes + + +;;; func core, ver, snum +slversion XTS_AES_256_enc, 01, 04, 0076 +slversion XTS_AES_256_enc_expanded_key, 01, 04, 0077 +slversion XTS_AES_256_dec, 01, 04, 0078 +slversion XTS_AES_256_dec_expanded_key, 01, 04, 0079 |